\documentclass[a4paper,12pt]{article}
\usepackage[margin=1in]{geometry}
\usepackage[utf8]{inputenc}
\usepackage{mathpazo}
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{natbib}
\usepackage{amsmath}
\usepackage{setspace}
\usepackage[normalem]{ulem}
\usepackage{sectsty}
\subsectionfont{\normalfont\large\underline}
\usepackage{floatrow}
\usepackage[hidelinks]{hyperref}
\floatsetup[figure]{capposition=top}
\floatsetup[table]{capposition=top}

\title{Measuring Subgroup Preferences in Conjoint Experiments\footnote{We thank Benjamin Lauderdale, Jamie Druckman, Yusaku Horiuchi, the editor, and anonymous reviewers for feedback on this manuscript. Replication data and code for this article are available from the \textit{Political Analysis} Dataverse: \url{https://doi.org/10.7910/DVN/ARHZU4}. This work was funded, in part, by the United Kingdom Economic and Social Research Council (Grant ES/R000573/1).}
}
\author{Thomas J. Leeper, Sara B. Hobolt, and James Tilley}

\begin{document}

\maketitle

\begin{abstract}
Conjoint analysis is a common tool for studying political preferences. The method disentangles patterns in respondents' favorability toward complex, multidimensional objects, such as candidates or policies. Most conjoints rely upon a fully randomized design to generate average marginal component effects (AMCEs). These measure the degree to which a given value of a conjoint profile feature increases, or decreases, respondents' support for the overall profile relative to a baseline, averaging across all respondents and other features. While the AMCE has a clear causal interpretation (about the \textit{effect} of features), most published conjoint analyses also use AMCEs to describe \textit{levels} of favorability. This often means comparing AMCEs among respondent subgroups. We show that using conditional AMCEs to describe the degree of subgroup agreement can be misleading as regression interactions are sensitive to the reference category used in the analysis. This leads to inferences about subgroup differences in preferences that have arbitrary sign, size, and significance. We demonstrate the problem using examples drawn from published articles and provide suggestions for improved reporting and interpretation using marginal means and an omnibus F-test. Given the accelerating use of these designs in political science, we offer advice for best practice in analysis and presentation of results.
\end{abstract}


<<knitr_options, cache=FALSE, echo=FALSE, results="hide">>=
# Penalise scientific notation so estimates print in fixed notation.
options(scipen = 10)

# Defaults applied to every later chunk unless its own header overrides them.
knitr::opts_chunk$set(
  cache   = TRUE,    # reuse stored results while a chunk's code is unchanged
  echo    = FALSE,   # do not print R source in the manuscript
  results = "hold",  # emit a chunk's text output in one piece
  warning = FALSE,   # do not print warnings
  message = FALSE,   # do not print messages
  error   = FALSE    # halt compilation on an R error instead of printing it
)
@

<<setup, cache=FALSE>>=
# Packages used by the chunks below. This chunk is never cached, so the
# packages are attached on every compilation.
# rio: data import/export and factorize()
library("rio")
# cregg: cj_df() and cj() for conjoint estimation
library("cregg")
# ggplot2 and gridExtra: figure layers and side-by-side panel layout
library("ggplot2")
library("gridExtra")
# xtable is only loaded as a namespace (not attached); the result of the
# availability check is not inspected here.
requireNamespace("xtable", quietly = TRUE)
@

<<data_hainmueller_candidate>>=
# Hainmueller et al. candidate experiment
hainmueller_candidate <- cj_df(rio::import("data/hainmueller-candidate.dta"))

# Profile feature columns and the display label attached to each one.
candidate_feature_labels <- c(
  atmilitary = "Military Service",
  atreligion = "Religion",
  ated       = "College",
  atprof     = "Profession",
  atinc      = "Income",
  atrace     = "Race/Ethnicity",
  atage      = "Age",
  atmale     = "Gender"
)

# Convert each labelled column to a factor, then set its "label" attribute.
for (feature in names(candidate_feature_labels)) {
  hainmueller_candidate[[feature]] <- rio::factorize(hainmueller_candidate[[feature]])
  attr(hainmueller_candidate[[feature]], "label") <- candidate_feature_labels[[feature]]
}

# Leave only the data set behind in the chunk's environment.
rm(candidate_feature_labels, feature)
@

<<data_hainmueller_immigration>>=
# Hainmueller et al. immigration experiment
# Loads the raw Stata file, renames the outcome, converts every profile
# feature to a labelled factor under a readable column name, derives a binary
# ethnocentrism split, and flags the profiles affected by design constraints.
hainmueller_immigration <- cj_df(rio::import("data/hainmueller-immigrant.dta"))

## outcome
hainmueller_immigration$ChosenImmigrant <- hainmueller_immigration$Chosen_Immigrant
hainmueller_immigration$Chosen_Immigrant <- NULL

## setup features
## (each raw Feat* column is dropped once its factor version exists)
hainmueller_immigration$Gender <- rio::factorize(hainmueller_immigration$FeatGender)
attr(hainmueller_immigration$Gender, "label") <- "Gender"
hainmueller_immigration$FeatGender <- NULL

hainmueller_immigration$Education <- rio::factorize(hainmueller_immigration$FeatEd)
# shorten level names by removing the " in the US" fragment
levels(hainmueller_immigration$Education) <- sub(" in the US", "", levels(hainmueller_immigration$Education))
attr(hainmueller_immigration$Education, "label") <- "Education"
hainmueller_immigration$FeatEd <- NULL

hainmueller_immigration$LanguageSkills <- rio::factorize(hainmueller_immigration$FeatLang)
# shorten level names by removing the common interview preamble
levels(hainmueller_immigration$LanguageSkills) <- sub("During( the)? admission interview, this applicant ", "", levels(hainmueller_immigration$LanguageSkills))
attr(hainmueller_immigration$LanguageSkills, "label") <- "Language Skills"
hainmueller_immigration$FeatLang <- NULL

hainmueller_immigration$CountryOfOrigin <- rio::factorize(hainmueller_immigration$FeatCountry)
# India is the reference category for country of origin
hainmueller_immigration$CountryOfOrigin <- relevel(hainmueller_immigration$CountryOfOrigin, "India")
attr(hainmueller_immigration$CountryOfOrigin, "label") <- "Country of Origin"
hainmueller_immigration$FeatCountry <- NULL

hainmueller_immigration$Job <- rio::factorize(hainmueller_immigration$FeatJob)
attr(hainmueller_immigration$Job, "label") <- "Job"
hainmueller_immigration$FeatJob <- NULL

hainmueller_immigration$JobExperience <- rio::factorize(hainmueller_immigration$FeatExp)
attr(hainmueller_immigration$JobExperience, "label") <- "Job Experience"
hainmueller_immigration$FeatExp <- NULL

hainmueller_immigration$JobPlans <- rio::factorize(hainmueller_immigration$FeatPlans)
levels(hainmueller_immigration$JobPlans) <- sub(" but has done job interviews", "", levels(hainmueller_immigration$JobPlans))
attr(hainmueller_immigration$JobPlans, "label") <- "Job Plans"
hainmueller_immigration$FeatPlans <- NULL

hainmueller_immigration$ReasonForApplication <- rio::factorize(hainmueller_immigration$FeatReason)
attr(hainmueller_immigration$ReasonForApplication, "label") <- "Reason for Application"
hainmueller_immigration$FeatReason <- NULL

hainmueller_immigration$PriorEntry <- rio::factorize(hainmueller_immigration$FeatTrips)
attr(hainmueller_immigration$PriorEntry, "label") <- "Prior Entry"
hainmueller_immigration$FeatTrips <- NULL

## covariates
## binary split of the ethnocentrism score at 10 (missing scores stay missing)
hainmueller_immigration$ethnocentrism_split <- ifelse(is.na(hainmueller_immigration$ethnocentrism), NA_real_, ifelse(hainmueller_immigration$ethnocentrism > 10, 1, 0))
hainmueller_immigration$ethnocentrism_split <- factor(hainmueller_immigration$ethnocentrism_split, levels = c(0,1), labels = c("low", "high"))

## constraints
## Logical flags marking which profiles fall in the restricted and the
## unrestricted part of each pair of linked features. `%in%` never returns NA,
## so the conditions are already clean TRUE/FALSE vectors.

# Constraint 1 (country of origin x reason for application).
# NOTE(review): presumably these six countries could only be paired with the
# two reasons listed here in the original design -- confirm against the design.
hainmueller_immigration$constraint1constrained <-
  (hainmueller_immigration$CountryOfOrigin %in% c("India", "Germany", "France", "Mexico", "Philippines", "Poland")) &
  (hainmueller_immigration$ReasonForApplication %in% c("Seek better job in U.S.", "Reunite with family members already in the U.S."))
# The unconstrained countries are the ones NOT in the list above. This list
# previously contained "France" (already in the constrained list) and omitted
# "Iraq", which left Iraq unflagged and France flagged twice.
# NOTE(review): confirm "Iraq" matches the level name in the data exactly.
hainmueller_immigration$constraint1unconstrained <-
  hainmueller_immigration$CountryOfOrigin %in% c("China", "Sudan", "Somalia", "Iraq")

# Constraint 2 (job x education): the four jobs listed first, combined with an
# education level above the four lowest levels.
hainmueller_immigration$constraint2constrained <-
  (hainmueller_immigration$Job %in% c("Financial analyst", "Computer programmer", "Research scientist", "Doctor")) &
  (!hainmueller_immigration$Education %in% c("No formal education", "Equivalent to completing fourth grade", "Equivalent to completing eighth grade", "Equivalent to completing high school"))
hainmueller_immigration$constraint2unconstrained <-
  hainmueller_immigration$Job %in% c("Janitor", "Waiter", "Child care provider", "Gardener", "Construction worker", "Teacher", "Nurse")
@

<<data_bechtel>>=
# Bechtel and Scheve climate agreement experiment.
# The cleaned data are cached as an .rds file. The cache path is held in one
# variable so that the existence check, the read, and the write cannot drift
# apart (the check previously looked for "data/bms.rds" while the read and
# write used "data/bechtel.rds", so the cache was never used).
bechtel_cache <- "data/bechtel.rds"
if (file.exists(bechtel_cache)) {
    bechtel <- cj_df(rio::import(bechtel_cache))
} else {
    # load original data file
    bechtel <- cj_df(rio::import("data/bechtel_scheve_pnas.dta"))

    # conjoint features
    # Pattern used throughout: remember the variable's "label" attribute,
    # convert the column to a factor, then put the label back.
    tmp <- attr(bechtel[["cost_cj"]], "label")
    bechtel[["cost_cj"]] <- rio::factorize(bechtel[["cost_cj"]])
    # replace the first character of every level with a euro sign
    levels(bechtel[["cost_cj"]]) <- gsub("^.{1}", "€", levels(bechtel$cost_cj))
    attr(bechtel[["cost_cj"]], "label") <- tmp
    rm(tmp)

    tmp <- attr(bechtel[["distrib_cj"]], "label")
    bechtel[["distrib_cj"]] <- rio::factorize(bechtel[["distrib_cj"]])
    attr(bechtel[["distrib_cj"]], "label") <- tmp
    rm(tmp)

    tmp <- attr(bechtel[["ctries_cj"]], "label")
    bechtel[["ctries_cj"]] <- rio::factorize(bechtel[["ctries_cj"]])
    attr(bechtel[["ctries_cj"]], "label") <- tmp
    rm(tmp)

    tmp <- attr(bechtel[["emissions_cj"]], "label")
    bechtel[["emissions_cj"]] <- rio::factorize(bechtel[["emissions_cj"]])
    attr(bechtel[["emissions_cj"]], "label") <- tmp
    rm(tmp)

    tmp <- attr(bechtel[["sanctions_cj"]], "label")
    bechtel[["sanctions_cj"]] <- rio::factorize(bechtel[["sanctions_cj"]])
    # only levels 2-4 get the euro sign; level 1 is left as is
    levels(bechtel[["sanctions_cj"]])[2:4] <- gsub("^.{1}", "€", levels(bechtel$sanctions_cj)[2:4])
    attr(bechtel[["sanctions_cj"]], "label") <- tmp
    rm(tmp)

    tmp <- attr(bechtel[["monitoring_cj"]], "label")
    bechtel[["monitoring_cj"]] <- rio::factorize(bechtel[["monitoring_cj"]])
    attr(bechtel[["monitoring_cj"]], "label") <- tmp
    rm(tmp)

    tmp <- attr(bechtel[["country"]], "label")
    bechtel[["country"]] <- rio::factorize(bechtel[["country"]])
    attr(bechtel[["country"]], "label") <- tmp
    rm(tmp)

    # covariates
    ## environmentalism
    tmp <- attr(bechtel[["support_iec_high_group"]], "label")
    bechtel[["environmentalism"]] <- rio::factorize(bechtel[["support_iec_high_group"]])
    levels(bechtel[["environmentalism"]]) <- paste0("Environmentalism: ", levels(bechtel[["environmentalism"]]))
    attr(bechtel[["environmentalism"]], "label") <- tmp
    rm(tmp)

    ## reciprocity
    tmp <- attr(bechtel[["recip_s_high_group"]], "label")
    bechtel[["reciprocity"]] <- rio::factorize(bechtel[["recip_s_high_group"]])
    levels(bechtel[["reciprocity"]]) <- paste0("Reciprocity: ", levels(bechtel[["reciprocity"]]))
    attr(bechtel[["reciprocity"]], "label") <- tmp
    rm(tmp)

    # export
    rio::export(bechtel, bechtel_cache)
}
@

<<data_tkr>>=
# Candidate experiment loaded from data/teele.dta (voter sample only).
# The cleaned data are cached in data/tkr.rds and rebuilt from the original
# Stata file whenever that cache is missing. Each feature is stored in the raw
# data as a set of 0/1 indicator columns; these are collapsed into one
# labelled factor per feature.
if (file.exists("data/tkr.rds")) {
    tkr <- cj_df(rio::import("data/tkr.rds"))
} else {
    # load original data file
    tkr <- cj_df(rio::import("data/teele.dta"))
    
    # recode respondent characteristics
    tkr[["sample"]] <- c("usa voter" = "Voter", "usa leg" = "Legislator")[tkr[["sample"]]]
    
    # subset to voters
    tkr <- tkr[tkr[["sample"]] == "Voter", ]
    
    # female_respondent is 0/1, so adding 1 indexes c("Male", "Female")
    tkr[["Sex"]] <- factor(c("Male", "Female")[tkr[["female_respondent"]]+1L])
    
    # character placeholder (was NA_integer_, silently coerced to character);
    # respondents with neither party flag set stay NA
    tkr[["PartyID"]] <- NA_character_
    tkr[["PartyID"]][tkr[["democrat_respondent"]] == 1] <- "Democrat"
    tkr[["PartyID"]][tkr[["republican_respondent"]] == 1] <- "Republican"
    tkr[["PartyID"]] <- factor(tkr[["PartyID"]])
    
    # label features and feature levels
    ## Sex
    tkr[["feature_sex"]] <- factor(tkr[["orig_cand_female"]], c(0,1), c("Male", "Female"))
    attr(tkr[["feature_sex"]], "label") <- "Candidate Sex"
    
    ## Political experience
    tkr[["feature_experience"]] <- NA_integer_
    tkr[["feature_experience"]][tkr[["orig_0ys"]] == 1] <- 1L
    tkr[["feature_experience"]][tkr[["orig_1ys"]] == 1] <- 2L
    tkr[["feature_experience"]][tkr[["orig_3ys"]] == 1] <- 3L
    tkr[["feature_experience"]][tkr[["orig_8ys"]] == 1] <- 4L
    tkr[["feature_experience"]] <- factor(tkr[["feature_experience"]], 1:4, c("None", "1 year", "3 years", "8 years"))
    attr(tkr[["feature_experience"]], "label") <- "Political Experience"
    
    ## Marital status
    # NOTE(review): the indicator names suggest orig_MD_sp = doctor spouse and
    # orig_FM_sp = farmer spouse -- confirm against the codebook. The codes
    # were previously assigned the other way round, so the "Doctor Spouse" and
    # "Farmer Spouse" labels were attached to the wrong profiles. The level
    # order (Unmarried, Doctor Spouse, Farmer Spouse) is unchanged.
    tkr[["feature_marital"]] <- NA_integer_
    tkr[["feature_marital"]][tkr[["orig_UN_sp"]] == 1] <- 1L
    tkr[["feature_marital"]][tkr[["orig_MD_sp"]] == 1] <- 2L
    tkr[["feature_marital"]][tkr[["orig_FM_sp"]] == 1] <- 3L
    tkr[["feature_marital"]] <- factor(tkr[["feature_marital"]], 1:3, c("Unmarried", "Doctor Spouse", "Farmer Spouse"))
    # label fixed: was misspelled "Martial Status"
    attr(tkr[["feature_marital"]], "label") <- "Marital Status"
    
    ## Profession
    tkr[["feature_job"]] <- NA_integer_
    tkr[["feature_job"]][tkr[["orig_teach"]] == 1] <- 1L
    tkr[["feature_job"]][tkr[["orig_law"]] == 1] <- 2L
    tkr[["feature_job"]][tkr[["orig_may"]] == 1] <- 3L
    tkr[["feature_job"]][tkr[["orig_leg"]] == 1] <- 4L
    tkr[["feature_job"]] <- factor(tkr[["feature_job"]], 1:4, c("Teacher", "Corporate Lawyer", "Mayor", "State Legislator"))
    attr(tkr[["feature_job"]], "label") <- "Job"
        
    ## Children
    tkr[["feature_children"]] <- NA_integer_
    tkr[["feature_children"]][tkr[["orig_0ch"]] == 1] <- 1L
    tkr[["feature_children"]][tkr[["orig_1ch"]] == 1] <- 2L
    tkr[["feature_children"]][tkr[["orig_3ch"]] == 1] <- 3L
    tkr[["feature_children"]] <- factor(tkr[["feature_children"]], 1:3, c("No children", "1 child", "3 children"))
    attr(tkr[["feature_children"]], "label") <- "Children"
    
    ## Age
    tkr[["feature_age"]] <- NA_integer_
    tkr[["feature_age"]][tkr[["orig_29"]] == 1] <- 1L
    tkr[["feature_age"]][tkr[["orig_45"]] == 1] <- 2L
    tkr[["feature_age"]][tkr[["orig_65"]] == 1] <- 3L
    tkr[["feature_age"]] <- factor(tkr[["feature_age"]], 1:3, c("29", "45", "65"))
    attr(tkr[["feature_age"]], "label") <- "Age"
    
    ## Office (experimental factor not reported in paper?)
    tkr[["feature_office"]] <- rio::factorize(tkr[["dv"]])
    attr(tkr[["feature_office"]], "label") <- "Office"
        
    # export
    rio::export(tkr, "data/tkr.rds")
}
@

\clearpage
\doublespacing

One aspect of the dramatic increase in the use of experiments within political science \citep{Druckmanetal2006, Mutz2011} is the establishment of conjoint experimental designs as a prominent methodological tool. While survey experiments have traditionally examined just one or two factors that might shape outcomes \citep[see, for reviews,][]{GainesKuklinskiQuirk2007, Sniderman2011}, conjoint designs allow researchers to study the independent effects on preferences of many features of complex, multidimensional objects. These include many different types of phenomena, such as political candidates \citep{Campbelletal2016, TeeleKallaRosenbluth2018}, immigrant admissions \citep{HainmuellerHopkins2015, BansakHainmuellerHangartner2016, WrightLevyCitrin2016}, and public policies \citep{GallegoMarx2017, Hankinson2018}. Factorial designs of this sort have a long history, but the driving force behind this use of conjoint analysis has been the introduction by \citet{HainmuellerHopkinsYamamoto2014} of a small-sample, fully randomized conjoint design. The associated analytic approach emphasizes a single quantity of interest: the average marginal component effect (AMCE). By capturing the multidimensionality of target objects, the randomized conjoint design breaks any explicit, or implicit, confounding between features of these objects. This gives the AMCE a clear causal interpretation: the degree to which a given value of a feature increases, or decreases, respondents' favorability towards a packaged conjoint profile relative to a baseline. 

While randomization of profile features gives the AMCE a causal interpretation, most published conjoint analyses in political science use AMCEs not only for \textit{causal} purposes (interpreting AMCEs as effect sizes), but also for \textit{descriptive} purposes. The aim is to map levels of favorability toward a multidimensional object across its various features.\footnote{See \citet{Shmueli2010} for an elaboration on the distinctions between explanatory (causal) modelling, descriptive modelling, and predictive modelling.} In this sense, conjoints are often applied like list experiments, using randomization to measure a sample's preferences over something difficult to measure with direct questioning. A positive AMCE for a given feature can be read as a descriptive measure of high favorability towards profiles with that feature. The quantity is causal, but it is often read descriptively.

This is particularly the case for subgroup analyses of conjoint experiments. Such exercises are an increasingly common feature of experimental analysis \citep{GreenKern2012, RatkovicTingley2017, GrimmerMessingWestwood2017, EgamiImai2018}. For example, the \citet{HainmuellerHopkinsYamamoto2014} study of immigration attitudes splits the sample in two using a measure of ethnocentrism and then compares AMCEs for the two subgroups. Similarly, \citet{BansakHainmuellerHangartner2016} compare preferences toward immigrants across a number of binary respondent characteristics: age, education, left-right ideology, and income. Other examples abound. \citet{BallardRosaMartinScheve2016} compare preferences over tax policies across a number of subgroups defined by demographics and political orientations; \citet{BechtelScheve2013} compare AMCEs on climate agreements across four different countries, and across subgroups of respondents; and \citet{TeeleKallaRosenbluth2018} compare AMCEs for features of male and female political candidates among male and female respondents. Most of these comparisons are visual or informal. But some involve explicit estimation of the subgroup difference, such as when \citet{KirklandCoppock2017} compare conditional AMCEs across hypothetical partisan and nonpartisan elections. Interpretation of subgroup AMCEs thus involves an implied quantity of interest: the \textit{difference} between two conditional AMCEs.

What is not necessarily obvious in such analyses is that differences-in-preferences (that is to say, the difference in degree of favorability toward profiles containing a given feature) are not directly reflected in subgroup differences-in-AMCEs. A difference in effect sizes is distinct from a difference in preferences. We show that a difference in two (or more) subgroups' favorability toward a conjoint feature --- like a difference in willingness to support a particular type of immigrant between high and low ethnocentrism respondents --- is only rarely reflected in the difference-in-AMCEs. In fact, no information about the similarity of the subgroups' preferences is provided by comparisons of subgroup AMCEs, yet such comparisons are commonly made in practice.

As we will show, where preferences in subgroups toward the experimental reference category are similar, the difference-in-AMCEs conveys preferences reasonably well. The problem occurs when preferences between subgroups diverge in the reference category. Here, the difference-in-AMCEs is a misleading representation of underlying patterns of favorability. Given most published conjoint studies report results based upon reference categories chosen for \textit{substantive} reasons about the nature or meaning of the levels rather than the configuration of preferences revealed in the experiment, difference-in-AMCEs should not be assumed to be interpretable as differences in subgroup preferences. The root of this error is likely familiar to many researchers: it is simply a matter of regression specification for models involving interactions between categorical regressors. \citet{EgamiImai2018}, for example, provide an extensive discussion of the implications of this property for interpreting causal interactions between randomized features of conjoint profiles. The state of the published literature would suggest the problem remains non-obvious when applied to descriptive analysis of subgroups in conjoint designs.\footnote{Since this manuscript has been under review, we have been made aware of one working paper by \citet{ClaytonFerwerdaHoriuchi2018}, on the topic of immigration preferences, that correctly notes the need to address the arbitrary reference category in order to compare subgroup preferences.}

In what follows, we demonstrate the challenges of conjoint analysis and remind readers of how reference category choice for profile features creates problems for comparing conditional AMCEs across respondent subgroups. We show how the use of an arbitrary reference category means the size, direction, and statistical significance of differences-in-AMCEs have little relationship to the underlying degree of favorability of the subgroups toward profiles with particular features. Reference category choices can make similar preferences look dissimilar and dissimilar preferences look similar. We demonstrate this with examples drawn from the published political science literature (namely experiments by \citealt{HainmuellerHopkinsYamamoto2014, BechtelScheve2013, TeeleKallaRosenbluth2018}). The paper then provides suggestions for improved conjoint reporting and interpretation based around two quantities of interest drawn from the factorial experimentation literature: (a) unadjusted marginal means, a quantity measuring favorability toward a given feature, and (b) an omnibus F-test, measuring differences therein. Software for the R programming language to support our findings --- and that can be used to examine sensitivity of conjoint analysis to reference category selection, calculate AMCEs and marginal means, perform subgroup analyses, and test for subgroup differences in any conjoint experiment \citep{Leeper2018cregg} --- is demonstrated throughout using example data \citep{LeeperHoboltTilley2019}. We conclude with advice for best practices in the analysis and presentation of conjoint results.


\section*{Quantities of Interest in Conjoint Experiments}\label{sec:quantities}

Conjoint analysis serves two purposes. One is to assess causal effects. Another is preference description.\footnote{Here we use ``preference'' as \citet{HainmuellerHopkinsYamamoto2014} do: that is, as a statement of \textit{favorability} or \textit{support} for a profile, not the more narrow economic definition of a strict rank ordering of objects by favorability.} In causal inference, fully randomized conjoints provide a design and analytic approach that allows researchers to understand the causal effect of a given feature on overall support for a multidimensional object, averaging across other features of the object included in the design. Such inferences can be thought of as statements of the form: ``shifting an immigrant's country of origin from India to Poland increases favorability by X percentage points.'' In descriptive inference, conjoints provide information about both (a) the \textit{absolute} favorability of respondents toward objects with particular features or combinations of features, and (b) the \textit{relative} favorability of respondents toward an object with alternative combinations of features. Such inferences can be thought of as statements of the form ``Polish immigrants are preferred by X\% of respondents'' or ``Polish immigrants are more supported than Mexican immigrants, by X percentage points.'' Thus both causal and descriptive interpretations of conjoints are based upon the distribution of preferences across profile features and differences in preferences across alternative feature combinations.

Analytically, a fully randomized conjoint design without constraints between profile features is simply a full-factorial experiment (with some cells possibly, albeit randomly, left unobserved). All quantities of interest relevant to the analysis of conjoint designs therefore derive from combinations of cell means, marginal means, and the grand mean, as in the traditional analysis of factorial experiments. In a forced choice conjoint design, the \textit{grand mean} is by definition 0.5 (i.e., 50\% of all profiles shown are chosen and 50\% are not chosen). \textit{Cell means} are the mean outcome for each particular combination of feature levels. In the full-factorial design discussed by \citet{HainmuellerHopkinsYamamoto2014} and now widely used in political science, many or perhaps most cell means are unobserved. For example, in their candidate choice experiment, there are $2 \times 6 \times 6 \times 6 \times 2 \times 6 \times 6 \times 6 = 186{,}624$ cell means, but only 3,466 observations. About 98\% of cell means are unobserved. While this would be problematic for attempting to infer pairwise comparisons between cells, conjoint analysts mostly focus on the marginal effects of each feature rather than more complex interactions. Appendix~\ref{app:quantities} provides detailed notation and elaborations of these definitions of quantities of interest.

In fully randomized designs, the average marginal component effects (AMCEs) are simply marginal effects of changing one feature level to another, all else constant. AMCEs therefore depend only upon \textit{marginal means}: that is, the column and row mean outcomes for each feature level averaging across all other features. A marginal mean describes the level of favorability toward profiles that have a particular feature level, ignoring all other features. For example, in the common forced-choice design with two alternatives, marginal means have a direct interpretation as probabilities. A marginal mean of 0 indicates respondents select profiles with that feature level with probability $P(Y=1|X=x) = 0$, while a marginal mean of 1 indicates respondents select profiles with that feature level with probability $P(Y=1|X=x)=1$, where $Y$ is a binary outcome and $X$ is a vector of profile features.\footnote{It is not possible for the marginal mean to equal zero or one if pairs of profiles shown together are allowed to have the same level of a given feature (for example, both immigrants are from Germany). Instead, the marginal mean can range from the probability of co-occurrence to 1 minus that probability. If there are five levels of a feature, each shown with equal probability, then the probability of co-occurrence is $\frac{1}{5} \times \frac{1}{5} = 0.04$ such that the marginal mean can take values in the range $(0.04,0.96)$. If the design is constrained so that features cannot be the same for both immigrants, then the marginal means fully range from zero to one. This constraint on the range of the marginal means also constrains the range of AMCEs. Notably, many conjoints provide features with only two levels, such as the male-versus-female candidate feature examined by \citet{TeeleKallaRosenbluth2018} or \citet{HainmuellerHopkinsYamamoto2014} in their conjoints on candidate choice. 
In such cases, the probability of co-occurrence is $\frac{1}{2} \times \frac{1}{2} = 0.25$, bounding the AMCE for female (as opposed to male) candidates to the range $(-0.5, 0.5)$ if both candidates can have the same sex. Caution is therefore needed in comparing the relative size of features with few levels to features with many levels given that effects have different bounds.} With rating scale outcomes, marginal means can vary arbitrarily along the outcome scale used.


<<hainmueller_candidate_replication, dependson=c("data_hainmueller_candidate"), fig.height=8, fig.width=10, fig.cap="Replication of Hainmueller et al. (2014) Candidate Experiment using AMCEs and MMs", fig.show="hold", message=FALSE>>=
# Two-panel figure: AMCEs on the left, marginal means (MMs) on the right, both
# estimated from the same model of profile choice on all eight features.
candidate_formula <- selected ~ atmilitary + atreligion + ated + atprof + atinc + atrace + atage + atmale

# Text layer printing an annotation just above each point; the caller supplies
# the aesthetic mapping that builds the label.
estimate_text <- function(label_mapping) {
  ggplot2::geom_text(
    label_mapping,
    colour = "black",
    size = 2,
    position = ggplot2::position_nudge(y = .5)
  )
}

# left panel: AMCEs, reference line at zero
amce_estimates <- cregg::cj(
  hainmueller_candidate,
  candidate_formula,
  id = ~ resID,
  estimate = "amce"
)
p1 <- plot(amce_estimates, vline = 0) +
  ggplot2::scale_x_continuous(
    limits = c(-0.4, 0.4),
    breaks = c(-0.4, -0.3, -0.2, -0.1, 0.0, 0.1, 0.2, 0.3, 0.4)
  ) +
  # rows without a standard error get no annotation
  estimate_text(
    ggplot2::aes(label = ifelse(is.na(std.error), "", sprintf("%0.2f (%0.2f)", estimate, std.error)))
  ) +
  ggplot2::theme(legend.position = "none")

# right panel: marginal means, reference line at 0.5
mm_estimates <- cregg::cj(
  hainmueller_candidate,
  candidate_formula,
  id = ~ resID,
  estimate = "mm"
)
p2 <- plot(mm_estimates, vline = 0.5) +
  ggplot2::scale_x_continuous(
    limits = c(0.1, 0.9),
    breaks = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
  ) +
  estimate_text(
    ggplot2::aes(label = sprintf("%0.2f (%0.2f)", estimate, std.error))
  ) +
  ggplot2::theme(legend.position = "none")

gridExtra::grid.arrange(p1, p2, ncol = 2)
@

Because levels of features are randomly assigned, pairwise differences between two marginal means for a given feature (e.g., between candidates who are male versus female) have a direct causal interpretation. For fully randomized designs, the AMCE proposed by \citet{HainmuellerHopkinsYamamoto2014} is equivalent to the average marginal effect of each feature level for a model where each feature is converted into a matrix of indicator variables with one level left out as a reference category. This is no different from any other regression context wherein one level of any categorical variable must be omitted from the design matrix in order to avoid perfect multicollinearity.\footnote{In designs that entail constraints between profile features, the average marginal effect is a weighted average of effects across each combination of the constrained features where the weights on the effects are arbitrary but typically uniform. We ignore this distinction in the remainder of this article, as all of our results apply equally to fully randomized and to constrained designs.} This close relationship between AMCEs and marginal means is visible in Figure \ref{fig:hainmueller_candidate_replication} which presents a replication of the AMCE-based analysis of the Hainmueller et al. candidate experiment (left panel) and an analogous examination of the results using marginal means (right panel). Note, in particular, how marginal means convey information about the preferences of respondents for all feature levels while AMCEs definitionally restrict the AMCE for the reference category to zero (or undefined). For example, the AMCE for a candidate serving in the military is a 0.09 (or 9 percentage point) increase in favorability, reflecting marginal means for non-serving and serving candidates of 0.46 and 0.54, respectively. Similarly, the zero effect size for candidate gender reflects identical marginal means for male and female candidates (0.50 in each case).
AMCEs in fully randomized designs are simply differences between marginal means at each feature level and the marginal mean in the reference category, ignoring other features.

The AMCE is often described as an estimate of the relative favorability of profiles with counterfactual levels of a feature. For example, \citet{TeeleKallaRosenbluth2018} summarize their conjoint on public support ``female candidates are favored [over men] by 7.3 percentage points'' (6). Similarly, \citet{HainmuellerHopkinsYamamoto2014} describe some of the results of conjoint on preferences toward political candidates:

\begin{quote}
We also see a bias against Mormon candidates, whose estimated level of support is 0.06 (SE = 0.03) lower when compared to a baseline candidate with no stated religion. Support for Evangelical Protestants is also 0.04 percentage points lower (SE = 0.02) than the baseline. (19)
\end{quote}

\noindent These examples make clear that despite the \textit{causal} inference potentially provided by the AMCE, the quantity of interest is frequently used to provide a characterization of preferences that has a distinctly descriptive flavor about the relative \textit{levels} of support across profiles and also across subgroups of respondents. Indeed, this style of description is widespread in conjoint analyses. This use of conjoints to provide descriptive inferences about patterns of preferences is important because AMCEs are defined as \textit{relative} quantities, requiring that patterns of preferences are expressed against a baseline, reference category for each conjoint feature. A positive AMCE is read as higher favorability but it is only higher relative to whatever category serves as the baseline. For example, in the \citeauthor{HainmuellerHopkinsYamamoto2014} candidate example, choosing a non-religious candidate as a baseline and interpreting the resulting AMCEs means that the differences between other pairs of marginal means (e.g., evaluations of Mormon and Evangelical candidates) are not obvious. The negative direction, and the size, of the AMCEs for Mormon and Evangelical candidates would be different if the least-liked category of Mormons were the reference group. More trivially, \citet{TeeleKallaRosenbluth2018} describe their comparisons about public preferences for female candidates relative to male candidates, but could have equivalently described patterns of equal size but opposite sign comparing preferences over male relative to female candidates. Appendix \ref{app:referencecategory} includes some additional illustrations of this point for interested readers.

\section*{Consequences of Arbitrary Reference Category Choice}\label{sec:challenges}

How do researchers decide which of tens of thousands of possible experimental cells should be selected as the reference category? Examining recently published conjoint analyses, it appears that the choice of reference category is either arbitrary or based upon substantive intuition about the meaning of feature levels. For example, \citet{HainmuellerHopkinsYamamoto2014} choose female immigrants as a baseline in their immigration experiment, thus providing an estimate of the AMCE of being male, while \citet{TeeleKallaRosenbluth2018} choose male candidates as a baseline in their conjoint, thus providing an estimate of the AMCE of being female. The choice is seemingly innocuous. Sometimes choices of reference category appear to be driven by substantive knowledge: on language skills of immigrants in their immigration experiment, \citet{HainmuellerHopkinsYamamoto2014} choose fluency as a baseline; on the prior trips to the US feature, ``never'' is chosen as the baseline.

% Difference in conditional AMCEs
\begin{table}
\caption{Uses of Subgroup Analysis Published in Political Science Journals}\label{tab:papers}
\begin{center}
\scriptsize
\begin{tabular}{p{1.5in} p{1.5in} p{1.5in} p{1.5in}}\toprule
\textbf{Paper} & \textbf{Journal} & \textbf{Topic} & \textbf{Subgroup Comparisons} \\ \midrule
\citet{BechtelScheve2013} & PNAS & Climate agreement preferences & Environmentalism and International Reciprocity Attitudes \\ \midrule
\citet{FranchinoZucchini2014} & PSRM & Candidate preferences & Political Interest, Left-right self-placement\\ \midrule
\citet{HainmuellerHopkinsYamamoto2014} & Political Analysis & Immigration preferences & Ethnocentrism \\ \midrule
\citet{HansenOlsenBech2014} & Political Behavior & Policy preferences & Partisanship \\ \midrule
\citet{Carlson2015} & World Politics & Candidate preferences & Co-ethnicity \\ \midrule
\citet{BansakHainmuellerHangartner2016} & Science & Immigration preferences & Left-right self-placement, age, education, income\\ \midrule
\citet{BallardRosaMartinScheve2016} & JOP & Tax preferences & Various\\ \midrule
\citet{Campbelletal2016} & BJPS & Candidate preferences & Partisanship \\ \midrule
\citet{CarnesLupu2016} & APSR & Candidate preferences & Partisanship \\ \midrule
\citet{Mummolo2016} & JOP & News selection & Various\\ \midrule
\citet{VivyanWagner2016} & EJPR & Candidate preferences & Political attitudes \\ \midrule
\citet{MummoloNall2017} & JOP & Mobility preferences & Partisanship \\ \midrule
\citet{BechtelGenoveseScheve2017} & BJPS & Climate agreement preferences & Employment sector emissions \\ \midrule
\citet{BechtelHainmuellerMargalit2017} & EJPR & International bailout preferences & Various\\ \midrule
\citet{GallegoMarx2017} & J. European Public Policy & Labor market policy & Left-right self-placement \\ \midrule
\citet{KirklandCoppock2017} & Political Behavior & Candidate preferences & Partisanship \\ \midrule
\citet{Sen2017} & PRQ & Judicial candidate preferences & Partisanship \\ \midrule
\citet{Sobolewskaetal2017} & J. Ethnic \& Migration Studies & Immigrant integration & Various \\ \midrule
\citet{EggersVivyanWagner2018} & JOP & Candidate preferences & Sex \\ \midrule
\citet{Hankinson2018} & APSR & Housing policy preferences & Various \\ \midrule
\citet{OliverosSchuster2018} & CPS & Bureaucrat candidate preferences & Various \\ \midrule
\citet{TeeleKallaRosenbluth2018} & APSR & Candidate preferences & Sex, Partisanship \\ \midrule
\citet{Careyetal2018} & Politics, Groups, and Identities & Hiring preferences & Various \\
\bottomrule
\end{tabular}
\end{center}

All articles in this table use subgroup conditional AMCEs to make inferences about differences in preferences between subgroups.
\end{table}

While seemingly arbitrary and innocuous, the choice of reference category can provide highly distorted descriptive interpretations of preferences among subgroups of respondents. This occurs when researchers examine \textit{conditional} AMCEs, wherein AMCEs are calculated separately for subgroups of respondents and those conditional estimates are directly compared \citep[13]{HainmuellerHopkinsYamamoto2014}. Conditional AMCEs convey the causal effect of an experimental factor on overall favorability among the subgroup of interest. Consider, for example, a two-condition candidate choice experiment where Democratic and Republican respondents are exposed to either a male or female candidate and opinions toward the candidate serve as the outcome. It is reasonable to imagine that effects of candidate sex might differ for the two groups and therefore to compare the size of the treatment effect between the two groups. Perhaps Democrats are more responsive to candidate sex than are Republicans, making the causal effect larger for Democrats than Republicans. When conjoint analysts engage in subgroup comparisons, they are engaging in this kind of search for heterogeneous treatment effects across subgroups, but across a much larger number of experimental factors.

As Table \ref{tab:papers} shows, discussions of conditional AMCEs in conjoint analyses often compare the size, and direction, of subgroup causal effects. Given the common practice of descriptively interpreting conjoint experimental results, such subgroup analyses seem perfectly intuitive. The set of subgroups listed in the last column of Table \ref{tab:papers} contains some unsurprising covariates, such as partisanship, that are of obvious theoretical interest in almost any study of individual preferences. If interpreted as a difference in the size of the \textit{causal effect} for two groups, such comparisons are perfectly consistent with more traditional experimental analysis and a perfectly acceptable interpretation of the conjoint results.

Yet, just as analysis of full sample conjoint data is often descriptive in nature, it is also the case that conjoint analysts frequently interpret differences in conditional AMCEs descriptively rather than causally. For example, in one analysis \citet{HainmuellerHopkinsYamamoto2014} visually compare the pattern of AMCEs among high- and low-ethnocentrism respondents and interpret that ``the patterns of support are generally similar for respondents irrespective of their level of ethnocentrism'' (22). \citet{BallardRosaMartinScheve2016} make similar comparisons in their tax policy conjoint: ``While there are few strong differences in preferences for taxing the lower three income groups (the `hard work' group has slightly lower elasticities for taxing the poor), there are strong differences in preferences for taxing the rich'' (12). In the \citet{BechtelScheve2013} conjoint on support for international climate change agreements in the United States, United Kingdom, Germany, and France, they summarize their results as ``We find that individuals in all four countries largely agree on which dimensions are important and to what extent'' (13765). In these examples, the differences between conditional AMCEs are used as a way of descriptively characterizing differences in \textit{preferences} (i.e. levels of support) between the groups rather than differences in \textit{causal effects on preferences} in the groups.

The selection of a reference category, while earlier an innocuous analytic decision, becomes substantially consequential for a descriptive reading of conditional AMCEs. Most obviously, using AMCEs descriptively prevents any description of the levels of favorability in the reference category. It can also lead to misinterpretations of patterns in preferences. AMCEs are relative, not absolute, statements about preferences. As such, there is simply no predictable connection between subgroup causal effects and the levels of underlying subgroup preferences. Yet analysts and their readers frequently interpret differences in conditional AMCEs as differences in underlying preferences. AMCEs do provide insight into the descriptive variation in preferences within-group and across-features, and conditional AMCEs do estimate the size of causal effects of features within groups. But AMCEs cannot provide direct insight into the pattern of preferences between groups because they do not provide information about \textit{absolute} levels of favorability toward profiles with each feature (or combination of features).

This additional information matters. Consider again the simple two-condition experiment in which the effect of a male as opposed to female candidate, $x \in \{0,1\}$, is compared across a single two-category covariate, $z \in \{0,1\}$, such as Democratic or Republican self-identification. Subgroup regression equations to estimate effects for each group are:

\begin{align*}
\hat{y} &= \beta_0 + \beta_1 x + \epsilon, \quad \forall z = 0 \\
\hat{y} &= \beta_2 + \beta_3 x + \epsilon, \quad \forall z = 1
\end{align*}

\noindent The effect of $x$ when $z=0$ is given by $\beta_1$. The effect of $x$ when $z=1$ is given by $\beta_3$. These are, in essence, the conditional AMCEs in a conjoint analysis. Yet the difference in AMCEs ($\beta_3 - \beta_1$) is not equal to the difference in preferences between the two groups, which is $\bar{y}_{z=1|x=1} - \bar{y}_{z=0|x=1}$ (estimated by $(\beta_2 + \beta_3) - (\beta_0 + \beta_1)$). The difference-in-AMCEs only equals the difference in preferences when $\beta_2 \equiv \beta_0$. Yet the standard AMCE-centric conjoint analysis does not present absolute favorability in the reference category. Similarity of conditional AMCEs only means similarity of the \textit{causal effect} of the feature across groups, not similarity of \textit{preferences} unless preferences toward profiles with the reference category are equivalent in both groups. Given the reference category choice is typically arbitrary or driven by substantive knowledge of the levels, there is never any reason to expect that the reference category satisfies this equality requirement. When using a difference-in-AMCEs comparison to estimate a difference in preferences, the size and direction of the bias is determined by the size of the difference in preferences toward the reference category within each subgroup.

<<tkr_replication, dependson=c("data_tkr"), fig.height=4.5, fig.width=5, fig.cap="Replication of Results for `Candidate Sex' Feature from Teele et al. (2018) Candidate Experiment using Full Sample AMCEs and MMs and Subgroup AMCEs and MMs for Democrats and Republicans", fig.show="hold", message=FALSE>>=
# Four stacked panels for the "Candidate Sex" feature:
#   a = full-sample AMCEs, b = full-sample MMs,
#   c = AMCEs by PartyID,  d = MMs by PartyID.

# Settings shared across panels.
tkr_formula <- winner ~ feature_sex
amce_xlim <- c(-0.11, 0.11)   # AMCE panels, reference line at 0
mm_xlim <- c(0.39, 0.61)      # MM panels, reference line at 0.5
tkr_panel_theme <- theme(     # no legend, panel border, or axis lines
  legend.position = "none",
  panel.border = element_blank(),
  axis.line.x = element_blank(),
  axis.line.y = element_blank()
)

# Panel a: full-sample AMCEs.
a <- plot(
  subset(
    cj(tkr, tkr_formula, id = ~ responseid, estimate = "amce"),
    feature == "Candidate Sex"
  ),
  xlab = NULL, feature_headers = FALSE, vline = 0, xlim = amce_xlim, size = 2
) +
  ggtitle(NULL, "Full Sample AMCEs") +
  scale_colour_manual("Feature", limits = "Candidate Sex", values = "black") +
  tkr_panel_theme

# Panel b: full-sample marginal means.
b <- plot(
  subset(
    cj(tkr, tkr_formula, id = ~ responseid, estimate = "mm"),
    feature == "Candidate Sex"
  ),
  xlab = NULL, feature_headers = FALSE, vline = 0.5, xlim = mm_xlim, size = 2
) +
  ggtitle(NULL, "Full Sample MMs") +
  scale_colour_manual("Feature", limits = "Candidate Sex", values = "black") +
  tkr_panel_theme

# Panel c: conditional AMCEs by party; groups are told apart by point shape
# and hand-placed text labels rather than a legend.
# NOTE: the object is named `c`; calls such as c(15, 17) below still reach
# base::c because R skips non-function objects when resolving a call.
c <- plot(
  subset(
    cj(tkr, tkr_formula, id = ~ responseid, estimate = "amce", by = ~ PartyID),
    feature == "Candidate Sex"
  ),
  xlab = NULL, feature_headers = FALSE, vline = 0, group = "PartyID", xlim = amce_xlim, size = 2
) +
  annotate("text", y = 1.7, x = -0.04, label = "Republicans", size = 3) +
  annotate("text", y = 1.35, x = 0.05, label = "Democrats", size = 3) +
  ggtitle(NULL, "Subgroup AMCEs") +
  scale_colour_manual(values = rep("black", 2)) +
  aes(shape = PartyID) +
  scale_shape_manual("PartyID", limits = c("Democrat", "Republican"), values = c(15, 17)) +
  tkr_panel_theme

# Panel d: conditional marginal means by party, labelled per level and group.
d <- plot(
  subset(
    cj(tkr, tkr_formula, id = ~ responseid, estimate = "mm", by = ~ PartyID),
    feature == "Candidate Sex"
  ),
  xlab = NULL, feature_headers = FALSE, vline = 0.5, group = "PartyID", xlim = mm_xlim, size = 2
) +
  annotate("text", y = 1.8, x = 0.48, label = "Rep.", size = 3) +
  annotate("text", y = 0.8, x = 0.52, label = "Rep.", size = 3) +
  annotate("text", y = 2.2, x = 0.55, label = "Dem.", size = 3) +
  annotate("text", y = 1.2, x = 0.45, label = "Dem.", size = 3) +
  ggtitle(NULL, "Subgroup MMs") +
  scale_colour_manual(values = rep("black", 2)) +
  aes(shape = PartyID) +
  scale_shape_manual("PartyID", limits = c("Democrat", "Republican"), values = c(15, 17)) +
  tkr_panel_theme

# Convert each panel to a grob and stack them in one column.
gA <- ggplotGrob(a)
gB <- ggplotGrob(b)
gC <- ggplotGrob(c)
gD <- ggplotGrob(d)
gridExtra::grid.arrange(gA, gB, gC, gD, ncol = 1)
@

To draw this example out more fully, the upper panel of Figure \ref{fig:tkr_replication} shows AMCEs for \citeauthor{TeeleKallaRosenbluth2018}'s candidate choice experiment for the full sample of respondents. The second panel shows full sample marginal means. Respondents' preference for female candidates is very apparent in both forms of analysis in the upper two panels because the AMCE definitionally equals the difference in marginal means. But how do Republicans and Democrats differ in their preferences over male and female candidates? The third panel shows conditional AMCEs separately for Democratic and Republican voters, as provided in the original paper and the lower panel shows the results using conditional marginal means for Democratic and Republican voters.\footnote{We opt here for visual presentation of results; tabular presentation of AMCEs, marginal means, and associated standard errors for all examples are included in the Appendix.} By requiring a reference category fixed to zero, the conditional AMCE results in the third panel suggest that there is a very large difference in favorability toward female candidates between Republican and Democratic respondents. In reality, however, the difference in these conditional AMCEs (0.089) reflects the true difference in favorability toward female candidates (difference: 0.045; Democrats: 0.537, Republicans: 0.492) \textit{plus} the difference in favorability toward male candidates (difference: 0.045; Democrats: 0.463, Republicans: 0.508). 
Because Democrats and Republicans actually differ in their views of profiles containing the reference (male) category, AMCEs sum the true differences in preferences for a given feature level with the difference in preferences toward the reference category.\footnote{Another example that clearly demonstrates the discrepancy between the differences in preferences and the differences in conditional AMCEs can be seen very clearly in the ``political experience'' feature of this experiment (see Appendix \ref{app:tkrpolexperience}).}

<<bechtel_difference_comparison, dependson=c("data_bms"), fig.width=7.5, fig.height=3, fig.cap="True Difference in Favorability and Implied Preference Differences between High and Low Environmentalism Respondents for `Monthly Cost' Feature from Bechtel and Scheve (2013) Climate Agreement Experiment for Each Possible Reference Category">>=
# Plots, for each level of cost_cj, three kinds of between-group difference
# (by environmentalism): the AMCE difference under the data's current
# reference level, the AMCE differences obtained under every possible
# reference level, and the difference in marginal means.

# Single analysis sample used for ALL three statistics: rows with a
# non-missing environmentalism value. (Previously the marginal-mean
# difference was estimated on the unfiltered data while both AMCE-difference
# estimates used this subset.)
bechtel_env <- bechtel[!is.na(bechtel$environmentalism), ]

# reported results
main <- cj(bechtel_env, choice_cj ~ cost_cj, id = ~ ID, weights = ~weight, estimate = "amce_diff", by = ~ environmentalism)
main$statistic <- "Originally Reported Difference"
## fill in baseline difference: append a copy of the last row, relabel it as
## the first (reference) level, and set its estimate to 0 with NA uncertainty
baseline_row <- nrow(main) + 1L
main[baseline_row, ] <- main[baseline_row - 1L, ]
main[baseline_row, "level"] <- levels(bechtel_env$cost_cj)[1L]
main[baseline_row, c("estimate", "std.error", "z", "p", "lower", "upper")] <- c(0, rep(NA, 5))

# AMCE differences: re-estimate once per cost level, each time making that
# level the reference category
amce_diffs <- do.call("rbind", lapply(levels(bechtel_env$cost_cj), function(x) {
    tmp <- bechtel_env
    tmp$cost_cj <- relevel(tmp$cost_cj, x)
    cj(tmp, choice_cj ~ cost_cj, id = ~ ID, weights = ~weight, estimate = "amce_diff", by = ~ environmentalism)
}))
amce_diffs$statistic <- "Potential Differences in AMCEs"

# Difference in Marginal Means (same sample as the AMCE differences)
mm_diff <- cj(bechtel_env, choice_cj ~ cost_cj, id = ~ ID, weights = ~weight, estimate = "mm_diff", by = ~ environmentalism)
mm_diff$statistic <- "Difference in MMs"

# Merge
diffs <- rbind(main, amce_diffs, mm_diff)
# Attach the display order as a levels attribute; it is read back below via
# levels(diffs$statistic) to set the limits of the colour/fill/shape scales.
levels(diffs$statistic) <- c("Originally Reported Difference", "Potential Differences in AMCEs", "Difference in MMs")
# Plot
ggplot(diffs, aes(x = estimate, y = level, group = statistic, colour = statistic, fill = statistic, shape = statistic)) +
  geom_vline(xintercept = 0, colour = "black", linetype = "dashed") +
  geom_point(position = ggstance::position_dodgev(height = 0.25), size=2) +
  geom_errorbarh(aes(xmin = lower, xmax = upper),  
                 size = 0.2, height = 0,
                 position = ggstance::position_dodgev(height = 0.25)) + 
  scale_colour_manual("Statistic", limits = levels(diffs$statistic),
                      values = c("black", "gray", "black"),
                      guide = ggplot2::guide_legend(title = "Statistic")) +
  scale_fill_manual("Statistic",
                    limits = levels(diffs$statistic),
                    values = c("black", "gray", "black"),
                    guide = ggplot2::guide_legend(title = "Statistic")) +
  scale_shape_manual("Statistic",
                     limits = levels(diffs$statistic),
                     values = c(20, 16, 23),
                     guide = ggplot2::guide_legend(title = "Statistic")) +
  # out-of-bounds values are kept as-is rather than censored
  scale_x_continuous(limits = c(-0.1,0.1), oob = scales::rescale_none) +
  ylab("") +
  xlab("Estimated Preference Difference between High and Low Environmentalism Respondents") + 
  theme_minimal() + 
      ggplot2::theme(
        legend.position = "bottom",
        panel.grid.major = ggplot2::element_blank(),
        panel.grid.minor = ggplot2::element_blank()
      )
  
# drop the temporaries created by this chunk
rm(main, mm_diff, amce_diffs, diffs, bechtel_env, baseline_row)
@



Visual or numerical similarity of subgroup AMCEs is therefore an analytical artefact, not an accurate statement of the similarity of patterns of preferences. We can see this bias in a reanalysis of \citeauthor{BechtelScheve2013}'s four-country climate change agreement experiment. Figure \ref{fig:bechtel_difference_comparison} shows an analysis for the feature capturing the monthly household cost for a potential international climate agreement. This replicates a portion of their results which compare high- and low-environmentalism respondents pooled across countries \citep[13767 figure 4]{BechtelScheve2013}. The original analysis has conditional AMCEs for the two subgroups with 28 Euro per month as the reference category. Conditional AMCEs for both groups are presented as negative with conditional AMCEs for low-environmentalism respondents being more negative than the conditional AMCEs for high-environmentalism respondents at every feature level. This implies positive differences in favorability toward each monthly cost between high- and low-environmentalism respondents. Figure \ref{fig:bechtel_difference_comparison} presents the implied difference-in-AMCEs from the original analysis as black circles, demonstrating the substantial and positive \textit{apparent} differences between the two groups. For example, the difference-in-AMCEs for the 56 Euro per month level (incorrectly) implies that high-environmentalism respondents are \textit{more} favorable toward a 56 Euro per month household cost of an agreement than are low-environmentalism respondents. Yet the opposite is actually true: high environmentalism respondents are less favorable toward this option than low environmentalism respondents. 
By using the 28 Euro per month level as the reference category, the original analysis implies that preferences are identical between the two groups when in reality high-environmentalism respondents are much less favorable toward a 28 Euro per month cost than low-environmentalism respondents. The black diamonds in Figure \ref{fig:bechtel_difference_comparison} show these true differences in favorability as marginal means for the two groups.

Furthermore, the gray dots in Figure \ref{fig:bechtel_difference_comparison} represent the alternative differences-in-AMCEs that \textit{could have been generated} from alternative choices of reference category using the same data. Not only is it possible for the choice of reference category to significantly color the apparent size of differences between subgroups, that choice can also impact the direction and statistical significance of subgroup differences. An analyst could easily choose a reference category that presents differences between these two groups as large and positive, small and positive, small and negative, large and negative, or negligible. The original analysis (again, black circles) happens to show large and positive differences between the groups.

It is worth highlighting two further features in Figure \ref{fig:bechtel_difference_comparison}. First, the alternative differences-in-AMCEs estimates vary mechanically around the difference in marginal means, as the reference category varies. The differences between marginal means for two groups are always fixed in the data, so the differencing of subgroup AMCEs is merely an exercise in centering those differences at arbitrary points along the range of observed differences in marginal means. Second, and more practically, because there is no category for which the preferences of the two subgroups in this example are identical, no choice of reference category would have led to inferences from differences-in-AMCEs that accurately reflect the underlying difference in preferences. Even in the 84 Euro per month level, the difference between the two groups is slightly positive. Were there a category for which subgroup preferences were exactly equal, then we could choose that as the reference category and interpret differences-in-AMCEs as differences in preferences. But there is never any guarantee that such a reference category exists. Thus, there is no way to use conditional AMCEs or differences between those conditional AMCEs to convey the underlying similarity or differences in preferences across sample subgroups.


\section*{Improved Subgroup Analyses in Conjoint Designs}\label{sec:marginalmeans}

Researchers and consumers of conjoints interested in describing levels of respondent favorability toward profiles with varying features can avoid the inferential errors that accompany conditional AMCEs by focusing attention on (subgroup) marginal means, differences between subgroup marginal means to infer subgroup differences in preferences toward particular features, and omnibus nested model comparisons to infer subgroup differences across many features. To demonstrate each of these three techniques we provide a complete example based upon \citeauthor{HainmuellerHopkinsYamamoto2014}'s analysis of their immigration conjoint by respondent ethnocentrism, which finds that ``the patterns of support are generally similar for respondents irrespective of their level of ethnocentrism'' \citep[22]{HainmuellerHopkinsYamamoto2014}. First, we show how different reference categories could have led to distinctly different conditional AMCEs and, therefore, interpretations of subgroup preference similarity. Second, we show how differences in marginal means clearly convey the similarity of these two subgroups without any sensitivity to reference category. Finally, we show how nested model comparisons would have provided \citeauthor{HainmuellerHopkinsYamamoto2014} with a statistical test of the claimed similarity in levels of support between these two respondent subgroups.



<<hainmueller_subgroup_example, dependson=c("data_hainmueller_immigration")>>=
# Builds two copies of the immigration data that differ only in each
# feature's reference category, then estimates AMCEs by ethnocentrism_split
# on both and stacks the results with a `dataset` tag ("A" / "B").

# Return `data` with every factor named in `refs` releveled so that the
# corresponding value of `refs` becomes its first (reference) level.
set_reference_levels <- function(data, refs) {
  for (feature_name in names(refs)) {
    data[[feature_name]] <- relevel(data[[feature_name]], refs[[feature_name]])
  }
  data
}

# estimates benchmarked to largest difference between subgroups
hainmueller.A <- set_reference_levels(hainmueller_immigration, c(
  Education = "No formal education",
  Gender = "male",
  CountryOfOrigin = "Mexico",
  ReasonForApplication = "Escape political/religious persecution",
  Job = "Doctor",
  JobExperience = "More than five years of job training and experience",
  JobPlans = "Has no plans to look for work at this time",
  PriorEntry = "Entered the U.S. once before without legal authorization",
  LanguageSkills = "spoke [language] and used an interpreter"
))

# estimates benchmarked to smallest difference between subgroups
hainmueller.B <- set_reference_levels(hainmueller_immigration, c(
  Education = "Equivalent to completing a college degree",
  Gender = "female",
  CountryOfOrigin = "Iraq",
  ReasonForApplication = "Reunite with family members already in the U.S.",
  Job = "Child care provider",
  JobExperience = "No job training or prior experience",
  JobPlans = "Does not have a contract with a U.S. employer",
  PriorEntry = "Never been to the U.S.",
  LanguageSkills = "spoke broken English"
))

# model formula: all nine features plus two feature-by-feature interactions
f1 <- ChosenImmigrant ~ LanguageSkills + CountryOfOrigin + Job + Education + JobExperience + JobPlans + ReasonForApplication + PriorEntry + Gender  + Education:Job + CountryOfOrigin:ReasonForApplication

# conditional AMCEs by ethnocentrism_split under each reference-category set
amce_by_ethnocentrism_1 <- cj(hainmueller.A, f1, id = ~ CaseID, estimate = "amce", by = ~ ethnocentrism_split)
amce_by_ethnocentrism_2 <- cj(hainmueller.B, f1, id = ~ CaseID, estimate = "amce", by = ~ ethnocentrism_split)
# (disabled) third variant on the unmodified data:
#amce_by_ethnocentrism_3 <- cj(hainmueller_immigration, f1, id = ~ CaseID, estimate = "amce", by = ~ ethnocentrism_split)

# label each result set, then stack them for faceted plotting
amce_by_ethnocentrism_1$dataset <- "A"
amce_by_ethnocentrism_2$dataset <- "B"
#amce_by_ethnocentrism_3$dataset <- "C"
amce_ref_example_merged <- rbind(amce_by_ethnocentrism_1, amce_by_ethnocentrism_2)
@

<<hainmueller_subgroup_example_plot, dependson=c("data_hainmueller_immigration", "hainmueller_subgroup_example"), fig.width=8, fig.height=6, fig.cap="Comparison of AMCEs for Low- and High-Ethnocentrism Respondents Using Two Alternative Reference Category Choices for Three Features from Hainmueller et al.'s (2014) Immigration Experiment", message=FALSE>>=
# Plot subgroup AMCEs for three features under the two reference-category
# configurations ("A" and "B") stored in the `dataset` column of
# amce_ref_example_merged.
plot(
  # keep only the three features shown in the figure
  subset(
    amce_ref_example_merged,
    feature %in% c("Education", "CountryOfOrigin", "Job")
  ),
  group = "ethnocentrism_split", 
  feature_headers = FALSE, 
  legend_title = "Ethnocentrism", 
  vline = 0
) + 
  # greyscale palette: "high" is drawn in black, "low" in gray
  scale_colour_manual(
    "Ethnocentrism", limits = c("high", "low"), values = c("Black", "Gray")
  ) +
  # one row of panels per feature, one column per configuration; each row gets
  # its own y-axis and a height proportional to its number of entries
  facet_grid(
    rows = vars(feature), 
    cols = vars(dataset), 
    scales = "free_y", 
    space = "free_y", 
    labeller = function(x) label_value(x, multi_line = FALSE)
  ) +
  theme(
    strip.background = element_rect(fill="white", colour="white"),
    strip.text.x = element_text(size = 8),
    # suppress the row (feature) strip labels
    strip.text.y = element_blank()
  )
@

To begin, consider the left and right facets of Figure \ref{fig:hainmueller_subgroup_example_plot}, which shows estimated subgroup AMCEs for three features from the immigration study. In panel ``A'' (left), all features are configured so that the reference category is the one with the largest difference in levels of support between the two subgroups thus distorting the size of differences at all other levels. In panel ``B'' (right), all features are configured so that the reference category is the one with the smallest difference in preferences between the two subgroups.

Panel A gives the impression that there are significant differences in preferences between high and low ethnocentrism respondents toward immigrants from different countries of origin, with different careers, and with different educational attainments because the reference category choice cascades the difference in reference category favorability into AMCEs for all other feature levels. By contrast, Panel B gives the impression that these differences are negligible. The experimental data and analytic approach in the two portrayals are identical; the only difference is the choice of reference category. Given what we have shown about the relationship between differences in conditional AMCEs and differences in conditional marginal means, Panel B is a more ``truthful'' visualization, which \citet{Cairo2016} uses to mean avoidance of self-deception in the presentation of data, and a more ``functional'' visualization, by which \citeauthor{Cairo2016} means choosing graphics based on how they will be interpreted by the visualization's consumers. The differences between subgroup AMCEs there more accurately convey differences in underlying preferences because the reference categories used in Panel B are the most similar between the two groups.

<<hainmueller_mm_diffs, dependson=c("data_hainmueller_immigration"), fig.height = 5, fig.width=8, fig.cap="Differences in Conditional Marginal Means, by Ethnocentrism, for Three Features From Hainmueller et al.'s (2014) Immigration Experiment", message=FALSE>>=
# Estimate differences in marginal means between ethnocentrism subgroups,
# using only rows where the subgroup indicator is observed.
mm_diff_input <- subset(hainmueller_immigration, !is.na(ethnocentrism_split))
mm_by_ethnocentrism <- cj(
  mm_diff_input,
  ChosenImmigrant ~ Gender + Education + LanguageSkills + CountryOfOrigin +
    Job + JobExperience + JobPlans + ReasonForApplication + PriorEntry,
  id = ~ CaseID,
  estimate = "mm_diff",
  by = ~ ethnocentrism_split
)

# Restrict the display to three features.
# NOTE(review): features are selected here by display label ("Country of
# Origin"), whereas the AMCE figure selects by variable name
# ("CountryOfOrigin") -- confirm both match the `feature` values in their
# respective estimate tables.
mm_diff_three_features <- subset(
  mm_by_ethnocentrism,
  feature %in% c("Country of Origin", "Job", "Education")
)

# Draw every estimate in black and drop the legend.
plot(
  mm_diff_three_features,
  vline = 0,
  xlim = c(-0.3, 0.3),
  xlab = "Estimated Difference in Marginal Means"
) +
  scale_colour_manual("feature", values = rep("Black", times = 9)) +
  ggplot2::theme(legend.position = "none")
@

Next, making a comparison of levels of favorability toward different types of immigrants without using AMCEs would have been even more truthful. Figure \ref{fig:hainmueller_mm_diffs} \textit{directly} shows that comparison of preferences as differences in subgroup marginal means between the two groups for these three features, with 95\% confidence intervals for the difference.\footnote{A presentation of subgroup marginal means for all features can be found in Appendix \ref{app:hainmuellerimmigration}.} The two groups indeed have similar preferences, something that would have happened to be clear had the conditional AMCEs in the right panel of Figure \ref{fig:hainmueller_subgroup_example_plot} been presented but that would have been far less obvious were the conditional AMCEs in the left panel of that figure presented. Pairwise difference in means tests would provide formal procedures for testing the statistical significance of these differences.

Yet, finally, the similarity of subgroup preferences in conjoints is often characterized in an \textit{omnibus} fashion, as in the quote from \citet{HainmuellerHopkinsYamamoto2014} describing ``patterns of support.'' An appropriate test in such cases is one that evaluates whether a model of support that accounts for group differences better fits the data than a model of support with only conjoint features as predictors. This type of test is known as a ``nested model comparison'' which compares the fit of a ``restricted'' regression (the restriction being that interactions between features and a subgroup identifier are held to be zero) nested within an ``unrestricted'' regression that allows for arbitrary interactions between conjoint features and the subgroup identifier. Formally, a nested model comparison provides an F-test of the null hypothesis that all interaction terms are equal to zero.\footnote{Like any ANOVA this hypothesis test may yield substantively different insight from a series of tests of pairwise mean differences. Figure \ref{fig:hainmueller_mm_diffs} shows three instances where the 95\% confidence intervals for pairwise differences in marginal means do not include zero even though the omnibus test fails to reject the null at $\alpha = 0.05$.}

To make this concrete, for a feature with four levels (one treated as a reference category), the first (restricted) equation would be:

\begin{equation}\label{eq:restricted}
Y = \beta_0 + \beta_1 Level_2 + \beta_2 Level_3 + \beta_3 Level_4 + u
\end{equation}

\noindent The second (unrestricted) equation would allow for interactions between feature levels and the subgroup identifier:

\begin{equation}\label{eq:unrestricted}
\begin{split}
Y = & \beta_0 + \beta_1 Level_2 + \beta_2 Level_3 + \beta_3 Level_4 + \beta_4 Group + \\
& \beta_5 Level_2 * Group + \beta_6 Level_3 * Group + \beta_7 Level_4 * Group + u
\end{split}
\end{equation}

\noindent While Equation \ref{eq:restricted} imposes the constraint that $\beta_4 = \beta_5 = \beta_6 = \beta_7 = 0$, Equation \ref{eq:unrestricted} allows for subgroup differences in favorability. Testing this null entails computing an F-statistic comparing the fit of each equation:

\begin{equation}
F = \dfrac{\dfrac{SSR_{Restricted} - SSR_{Unrestricted}}{r}}{\dfrac{SSR_{Unrestricted}}{n - k - 1}}
\end{equation}

\noindent where $SSR_{Restricted}$ is the sum of squared residuals for Equation \ref{eq:restricted}, $SSR_{Unrestricted}$ is the sum of squared residuals for Equation \ref{eq:unrestricted}, $r$ is the number of restrictions (in the above example, 4), $n$ is the number of cases, and $k$ is the number of regressors (excluding the intercept) in the unrestricted model (in the above example, 7).\footnote{Note that this test is not sensitive to reference category even though it requires specifying a regression equation.}

<<hainmueller_anovas, dependson=c("data_hainmueller_immigration"), eval=TRUE, echo=FALSE>>=
# Nested model comparisons by ethnocentrism subgroup: one per feature of
# interest and one for the full model. All four use the same rows, namely those
# with a non-missing subgroup indicator.
hainmueller_anova_input <- subset(hainmueller_immigration, !is.na(ethnocentrism_split))

hainmueller_anova_education <- cj_anova(
  hainmueller_anova_input,
  ChosenImmigrant ~ Education,
  id = ~ CaseID, by = ~ ethnocentrism_split
)
hainmueller_anova_country <- cj_anova(
  hainmueller_anova_input,
  ChosenImmigrant ~ CountryOfOrigin,
  id = ~ CaseID, by = ~ ethnocentrism_split
)
hainmueller_anova_job <- cj_anova(
  hainmueller_anova_input,
  ChosenImmigrant ~ Job,
  id = ~ CaseID, by = ~ ethnocentrism_split
)
# Full specification: all nine features plus the two feature interactions.
hainmueller_anova_all <- cj_anova(
  hainmueller_anova_input,
  ChosenImmigrant ~ Gender + Education + LanguageSkills + CountryOfOrigin +
    Job + JobExperience + JobPlans + ReasonForApplication + PriorEntry +
    Education:Job + CountryOfOrigin:ReasonForApplication,
  id = ~ CaseID, by = ~ ethnocentrism_split
)
@

For the education feature, the resulting F-test for the model comparison in this case again gives us little reason to believe there are subgroup differences: \Sexpr{sprintf("F(%d, %d)=%0.2f, $p\\leq%0.2f$", hainmueller_anova_education$Df[2], hainmueller_anova_education[["Resid. Df"]][1], hainmueller_anova_education$F[2], hainmueller_anova_education$"Pr(>F)"[2])}. We could repeat such pairwise comparisons or omnibus comparisons for each feature in the design --- for country of origin (\Sexpr{sprintf("F(%d, %d)=%0.2f, $p\\leq%0.2f$", hainmueller_anova_country$Df[2], hainmueller_anova_country[["Resid. Df"]][1], hainmueller_anova_country$F[2], hainmueller_anova_country$"Pr(>F)"[2])}) or job (\Sexpr{sprintf("F(%d, %d)=%0.2f, $p\\leq%0.2f$", hainmueller_anova_job$Df[2], hainmueller_anova_job[["Resid. Df"]][1], hainmueller_anova_job$F[2], hainmueller_anova_job$"Pr(>F)"[2])}) --- or for all features as a whole (\Sexpr{sprintf("F(%d, %d)=%0.2f, $p\\leq%0.2f$", hainmueller_anova_all$Df[2], hainmueller_anova_all[["Resid. Df"]][1], hainmueller_anova_all$F[2], hainmueller_anova_all$"Pr(>F)"[2])}).

This visual display in Figure \ref{fig:hainmueller_mm_diffs} and these statistical tests make clear what could not be directly inferred from conditional AMCEs alone: there are indeed no sizeable and only a few statistically apparent differences in preferences between the two groups.

<<bechtel_anova, dependson=c("data_bechtel"), eval=TRUE, echo=FALSE>>=
# Nested model comparisons by respondent country: one for all six features and
# one for the cost feature alone. Both use rows with a non-missing country.
bechtel_country_input <- subset(bechtel, !is.na(country))

# Disabled alternatives that group by environmentalism or reciprocity instead:
#bechtel_anova_environmentalism <- cj_anova(subset(bechtel, !is.na(environmentalism)), choice_cj ~ cost_cj + distrib_cj + ctries_cj + emissions_cj + sanctions_cj + monitoring_cj, id = ~ ID, by = ~ environmentalism)
#bechtel_anova_reciprocity <- cj_anova(subset(bechtel, !is.na(reciprocity)), choice_cj ~ cost_cj + distrib_cj + ctries_cj + emissions_cj + sanctions_cj + monitoring_cj, id = ~ ID, by = ~ reciprocity)

bechtel_anova_country <- cj_anova(
  bechtel_country_input,
  choice_cj ~ cost_cj + distrib_cj + ctries_cj + emissions_cj + sanctions_cj + monitoring_cj,
  id = ~ ID, by = ~ country
)
bechtel_anova_country_cost <- cj_anova(
  bechtel_country_input,
  choice_cj ~ cost_cj,
  id = ~ ID, by = ~ country
)
@

This kind of nested model comparison test can also be used to assess heterogeneity across conjoint features \citep[see also][]{EgamiImai2018}. For example, \citet{TeeleKallaRosenbluth2018} report just such a test for how effects of features other than candidate sex may differ between male and female candidates, finding no such heterogeneity (8--9). Fortunately, the original analysis accurately detected an absence of subgroup differences, yet a subtly different set of analytic decisions about reference categories (as shown in Figure \ref{fig:hainmueller_subgroup_example_plot}) could have led to quite different inferences. As an example, \citet{BechtelScheve2013} argue that their conjoint results show ``individuals in all four countries [Germany, France, United States, United Kingdom] largely agree on which dimensions are important and to what extent'' \citep[13765]{BechtelScheve2013}, but a nested model comparison shows the countries do differ in their preferences (\Sexpr{sprintf("F(%d, %d)=%0.2f, $p\\leq%0.2f$", bechtel_anova_country$Df[2], bechtel_anova_country[["Resid. Df"]][1], bechtel_anova_country$F[2], bechtel_anova_country$"Pr(>F)"[2])}). This cross-country variation is largely driven by differences in sensitivity to the monthly household costs feature, \Sexpr{sprintf("F(%d, %d)=%0.2f, $p\\leq%0.2f$", bechtel_anova_country_cost$Df[2], bechtel_anova_country_cost[["Resid. Df"]][1], bechtel_anova_country_cost$F[2], bechtel_anova_country_cost$"Pr(>F)"[2])}, with the United Kingdom and United States being more cost sensitive than Germany and France. Visual comparisons of conditional AMCEs can sometimes provide accurate insights into subgroup differences in preferences (as in the \citeauthor{HainmuellerHopkinsYamamoto2014} case), but ultimately there is no guarantee that they do in any particular analysis.

\section*{Conclusion}\label{sec:conclusion}

This article has identified several challenges related to the analysis and reporting of conjoint experimental designs, particularly analyses of subgroup differences. We suggest that conjoint analyses should report not only average marginal component effects (AMCEs) but also descriptive quantities about levels of favorability that better convey underlying preferences over profile features and better convey subgroup differences in those preferences. Marginal means contain all of the information provided by AMCEs and more. Consequently, our intention here is not to substantively undermine any previous set of results, but instead to urge researchers moving forward to demonstrate considerable caution in how they design, analyze, and present the results of these types of descriptive experiments and how they test for differences in preferences between subgroups.

We have relatively straightforward and hopefully uncontroversial advice for how analysts of conjoint experiments should proceed:

\begin{enumerate}
\item Always report unadjusted marginal means when attempting to provide a \textit{descriptive} summary of respondent preferences in addition to, or instead of, AMCEs.%\footnote{Like the presentation of AMCEs, displaying marginal means in constrained conjoint designs may also distort apparent patterns given that not all features can co-occur. Partitioning the design into fractions such that each fraction contains a fully unconstrained design would mitigate any concern with that presentation.}

\item Exercise caution when explicitly, or implicitly, interpreting differences-in-AMCEs across subgroups. Differences-in-AMCEs are differences in effect sizes for subgroups, not statements about the relative favorability of the subgroups toward profiles with a given feature. Heterogeneous effects do not necessarily mean different underlying preferences. If differences in AMCEs are reported, the choice of reference categories should be discussed explicitly and diagnostics should be provided to justify it.

\item When descriptively characterizing differences in preference level between subgroups, directly estimate the subgroup difference using conditional marginal means and differences between conditional marginal means, rather than relying on the difference-in-AMCEs.

\item To formally test for group differences in preferences, regression with interaction terms between the subgrouping covariate and all feature levels will generate estimates of level-specific differences in preferences via the coefficients on the interaction terms. A nested model comparison between this equation and one without such interactions provides an omnibus test of subgroup differences, which should be reported when characterizing overall patterns of subgroup differences.
\end{enumerate}

\noindent Following this advice, we hope, will allow researchers to more clearly and more accurately represent descriptive results of conjoint experiments.

The popularity of conjoint analyses in recent years highlights the power of the design and the important contributions made by \citet{HainmuellerHopkinsYamamoto2014} in providing a novel causal interpretation of these fully randomized factorial designs. Yet with new tools always come new challenges. The now-common practice of descriptively interpreting conjoints requires more caution than is immediately obvious. To facilitate improved analysis and, especially, to provide easy-to-use tools for calculating marginal means and performing reference category selection diagnostics, we provide software called \textbf{cregg} \citep{Leeper2018cregg} available from the Comprehensive R Archive Network. Additionally, this manuscript is written as a reproducible knitr document \citep{Xie2015} that contains complete code examples that will perform all analyses and visualization used throughout this article. With these resources in-hand, researchers should be well-equipped to analyze subgroup preferences in conjoint designs without running into the analytic challenges discussed here.




\singlespacing
\bibliographystyle{apsa-leeper}
\bibliography{references}
\clearpage


\appendix
\setcounter{page}{1}

\noindent {\large ``Measuring Subgroup Preferences in Conjoint Experiments: Supplemental Information''}

\vspace{0.5em}

\noindent Thomas J. Leeper, Sara B. Hobolt, and James Tilley\\
\noindent 2019-05-24

\renewcommand{\contentsname}{}
\vspace{-2em}
\tableofcontents



\clearpage

\section{Definition of Quantities of Interest}\label{app:quantities}

A conjoint experiment serves two purposes: (1) description of the conditional distribution of favorability over variations in multiple features, and (2) leveraging the random observation of combinations of features (so-called ``profiles'') to infer that any differences in favorability over features are causally attributable to the features as opposed to something else. The quantities of interest are therefore functions of the features being randomized as in any factorial experiment. But additionally, conjoints typically involve within-subjects research designs (i.e., multiple, different profile observations per participant) thus necessitating some additional notation to account for the \textit{survey implementation} of the conjoint in addition to the definition of the descriptive and causal parameters of interest.

Ultimately, a conjoint since \citet{HainmuellerHopkinsYamamoto2014} is a complex survey-experimental design involving multiple observations across a high-dimension factorial experimental space. Specifically, $I$ respondents ($i \in \{1, \dots, I\}$) are presented with $K$ rating or forced choice decision tasks, each involving $J$ (typically 2) alternative profiles of, for example, candidates or policies. Each profile consists of a vector of $F$ (typically discrete) features or attributes that describe the profile (e.g., age, sex), each composed of $D_f$ alternative levels, a number which can vary across features. The experiment thus generates a dataset with $N = I \times J \times K$ observations of some rating scale or discrete choice outcome, $Y$, from a random sample of profiles drawn from the $C = \prod_{f = 1}^{F} D_f$ population of experimental \textit{cells} in the $F$-dimension feature space.

The survey implementation of the conjoint therefore generates $N$ observations that can be indexed by $i,j,k$, forming an $N \times (F+4)$ dimensional data matrix $\mathbf{M}$ with each row representing the vector of feature levels $\vec{F}$ in each profile $j$ of respondent $i$'s task $k$, with indicators for $i$, $j$, $k$, and the corresponding outcome $Y_{i,j,k}$.\footnote{In typical paired designs (where $J=2$), this means each task generates two data points: $Y_{i,1,k}$ and $Y_{i,2,k}$. Note, too, that in fully randomized designs, these two profiles can be identical. Furthermore in fully randomized, forced-choice designs this can yield the additional curiosity that $Y_{i,\mathbf{c}} \neq Y_{i,\mathbf{c}}$ for a given respondent, $i$, and profile, $\mathbf{c}$.}

With no loss of information, we can think of each row in this matrix equivalently as an observation of $Y_{i,\vec{F}}$. This is because \citet{HainmuellerHopkinsYamamoto2014} make several important assumptions that allow us to interpret these data in a different way than the survey implementation implies. First, they assume no carryover effects (Assumption 1), such that multiple observations from the same respondent can be treated as independent of one another. Second, they assume no profile order effects within-task (Assumption 2), such that profiles within a task can be treated as independent of each other. Assumptions 1 and 2 imply that the survey implementation indices for task, $k$, and profile-within-task, $j$, can be ignored. They have no bearing on any quantity of interest, by assumption. 

The analyst is therefore left with a dataset of $N$ observations, grouped into $I$ participants, each providing observations of $Y_{\vec{F}}$. All quantities of interest must therefore be specified as features of the distribution of $Y$ over the $F$-dimensional feature space. In what follows, we therefore focus on the experimental features being randomized rather than the survey design factors being assumed away. \citet{HainmuellerHopkinsYamamoto2014} make a third assumption that profiles are randomly constituted (Assumption 3), which, in a fully randomized design, means that features and feature combinations are randomly sampled for observation. If this randomization is uniform (which it almost always is in applied examples) this means we can additionally ignore the probability of observing any given combination (as all profiles are equally likely to be observed). This is a point we return to in a moment.

The most basic thing that can be estimated about the distribution of $Y$ is the expected value, $E[Y]$, or \textit{grand mean} (in the parlance of factorial experiments). We can think of this quantity in terms of the survey implementation process (namely, respondents, tasks, and profiles) or as a simple function of the resulting data:

\begin{equation}
\bar{Y} = \dfrac{1}{I \times J \times K} \sum_{i=1}^{I} \sum_{j=1}^{J} \sum_{k=1}^{K} Y_{i,j,k} = \dfrac{1}{N} \sum_{n=1}^{N} Y_n
\end{equation}

\noindent The nested summation over $i,j,k$ could be stated explicitly but is unnecessary as the grand mean is simply the mean of all observed $Y$. A useful check on intuition is that in a forced choice design, where a respondent must choose only one profile, $j$, of all those presented in each task $k$, then by design $\bar{Y} = \frac{1}{J}$. For common, two-alternative, forced choice designs, $\bar{Y}$ therefore always equals $0.5$. By contrast, in rating scale designs, $\bar{Y}$ can take any value between the lower and upper bounds of the rating scale.

In a \textit{full factorial} experiment where $N > C$ (the number of observations is larger than the number of cells) due to a large sample, few factors, few levels of each factor, or some combination of these design characteristics, a sensible next quantity of interest is the \textit{cell mean}: $E[Y|\vec{X} = \vec{x}]$, which in a conjoint simply measures the mean favorability toward a particular profile, $\vec{x}$. An effort to actually estimate this quantity will, however, become obviously intractable when one recognizes that the number of observations in a typical conjoint is much lower than the number of feasible profiles ($N \ll C$). The cell mean can be unobserved for many or perhaps most experimental cells. 

Therefore quantities of interest that derive from it --- such as pairwise differences of means between cells --- cannot be estimated for any of the arbitrary $\binom{C}{2}$ pairs of cells. As an example, in the \citet{HainmuellerHopkinsYamamoto2014} candidate experiment, $C = 6^6 \times 2^2 = 186{,}624$ and $N = 3466$, so less than 2\% of experimental cells were observable and a minuscule fraction of the 17.4 billion pairwise cell combinations could have generated estimable effects. 

It is at this point that the quantities of interest in a conjoint can become confusing. In a typical experiment where $N > C$, these pairwise differences of means are the standard estimator for a causal effect (the estimand being the causal effect on favorability of changing from one profile to another). For example, we might be interested in the effect on $Y$ of changing the value of one feature to another theoretically interesting value of that feature, holding all other feature values in the profile constant:

\begin{equation}
\tau = E[Y|X_1=x_1,X_2=x_2,\dots,X_f=x_f] - E[Y|X_1= \neg x_1,X_2=x_2,\dots,X_f=x_f]
\end{equation}

\noindent but we have no guarantee that both or, in fact, either of those particular cells are observed. If even this minimal causal quantity cannot be guaranteed to be estimable by design, questions about higher-order interactions across features are even more difficult to estimate as they require observing four or more specific cells, any of which may be missing. Even if we were interested in such quantities, we would be unlikely to be able to estimate them.

Conjoint designs therefore ask us to think about completely different quantities of interest from typical sentiment measurement or experimentation. Consequently, what quantities might we care about that can be estimated from an $F$-dimension factorial experiment with considerable sparsity other than the grand mean? 

Even though $N \ll C$ in most applied conjoints, $N > F$. This means that even if we probably cannot learn about particular high-dimensional \textit{combinations of features}, we can learn about favorability toward particular features alone. That is, we can learn about conditional expectations over each feature dimension, $E[Y|X_f=x_f]$. In the factorial experiments literature, this conditional mean is called the \textit{marginal mean} (as it lies at the margins of a tabular presentation of cell means for the complete design). For example, the following 2x3 factorial design contains 6 cell means ($2 \times 3$), 1 grand mean, and 5 marginal means ($2+3$, one for each level of each factor):

\begin{center}
\begin{tabular}{lccc}\toprule
 & $A = 1$ & $A = 2$ & \\ \midrule
$B = 1$ & $\bar{Y}_{A=1, B=1}$ & $\bar{Y}_{A=2, B=1}$ & $E[Y|B=1]$ \\
$B = 2$ & $\bar{Y}_{A=1, B=2}$ & $\bar{Y}_{A=2, B=2}$ & $E[Y|B=2]$ \\
$B = 3$ & $\bar{Y}_{A=1, B=3}$ & $\bar{Y}_{A=2, B=3}$ & $E[Y|B=3]$ \\ \midrule
 & $E[Y|A=1]$ & $E[Y|A=2]$ & $E[Y]$ \\ \bottomrule
\end{tabular}
\end{center}

The uniform sampling of cells in the design means that this quantity can be estimated by the simple mean of $Y \forall X_f = x_f$.\footnote{In unbalanced designs where the probability of being in a given cell is not uniform across cells, there is sometimes a distinction made between \textit{descriptive} marginal means that equally weight observations and \textit{design} marginal means that equally weight cells in the design. Given conjoint designs generally do not allow for the observation of cell means, the distinction is not relevant and we refer to \textit{descriptive} marginal means simply as ``marginal means.''}

Were a constrained conjoint design used where some feature combinations were impossible, the marginal means would only be intelligible in the fractions of the design where all cells are observed.\footnote{Practically, the random sampling of cells does not need to be uniform; over- and under-representation of cells is possible. We focus here on fully randomized designs that draw profiles from the full space with equal probability. A nuance in \citeauthor{HainmuellerHopkinsYamamoto2014}'s notation is that their quantities of interest are conditioned on an arbitrary joint distribution of features rather than the particular joint distribution of features that was used to construct design or the joint distribution of features that happens to emerge empirically. In other words, they weight cells by an arbitrary joint probability mass function.}

To clarify this point, consider the constrained 2x3 design below where one cell is unobserved by design:

\begin{center}
\begin{tabular}{lccc}\toprule
 & $A = 1$ & $A = 2$ & \\ \midrule
$B = 1$ & $\bar{Y}_{A=1, B=1}$ & $\bar{Y}_{A=2, B=1}$ & $E[Y|B=1]$ \\
$B = 2$ & $\bar{Y}_{A=1, B=2}$ & $\bar{Y}_{A=2, B=2}$ & $E[Y|B=2]$ \\
$B = 3$ & $\bar{Y}_{A=1, B=3}$ & -- & $E[Y|B=3]$ \\ \midrule
 & $E[Y|A=1]$ & $E[Y|A=2]$ & $E[Y]$ \\ \bottomrule
\end{tabular}
\end{center}

\noindent Were the lower-right cell ($A=2, B=3$) observable by design, then a direct comparison of the marginal means, $E[Y|A=1]$ and $E[Y|A=2]$, in the lower table margin would provide direct insight into the relative favorability of respondents to profiles with features $A=1$ and $A=2$, marginalized over $B$. But because this cell is unobserved, these marginal means marginalize over different subsets of the possible values of $B$ making them not obviously comparable. By contrast, the first and second marginal means at the top-right of the table --- $E[Y|B=1]$ and $E[Y|B=2]$ --- provide insight into the favorability of participants toward profiles with feature $B=1$ and with feature $B=2$ marginalizing over the two possible values of $A$. A researcher could safely conclude that participants are more (less) favorable toward profiles with feature $B=1$ than $B=2$ from this information alone. But they would not be able to do so for feature $A$ without either (a) an explicit caveat that the comparison is of dissimilar subsets of profiles along dimension $B$ or (b) calculating marginal means over only the completely observable\footnote{Note that what matters here is \textit{observability}, not whether any given cell is actually observed. We know from above that most cells will be unobserved even in a uniformly sampled, unconstrained design.} portion of the feature space due to the curse of dimensionality.

For the common \textit{descriptive} use of conjoint designs to measure preferences over multi-dimensional objects, these marginal means alone are of direct interest. They express favorability on the scale of the outcome over alternative values of each feature independent of the features in the design.\footnote{They do not necessarily convey favorability in an absolute sense. A high marginal mean for a given feature does not imply that the sample prefers that feature in an absolute sense. Instead, favorability has to be understood in light of the features presented to respondents. This is the innovation in conjoints; rather than asking respondents whether they will support a Mormon candidate (for example), we can infer their favorability toward a Mormon candidate in light of other candidate characteristics they may consider. Still our design may not contain all such features, so caution is needed in drawing typical public opinion inferences from these marginal means.}

For the \textit{causal} interpretation of conjoint designs, comparisons of these marginal means are required. Comparisons between them provide causal inferences about the effect of changing a focal feature, marginalizing across the distribution of other features. Because feature combinations (i.e., the profiles) are randomly constructed and randomly observed from all possible combinations, the distribution of other non-focal features is, in expectation, independent of the focal feature, thus identical across all levels of the focal feature, and therefore ignorable.

A typical causal effect of interest is therefore the difference in marginal means across two levels of a feature (i.e., the marginal effect of a change in a feature's levels). For an unconstrained design, this difference is the \textit{average marginal component effect} (AMCE) defined by \citet{HainmuellerHopkinsYamamoto2014}. In this way, an AMCE is simply a marginal effect of the factorial design: the difference of two marginal means.

Unfortunately, this is not a perfectly complete definition, but it covers the vast majority of applied cases. The exceptions are few. First, \citeauthor{HainmuellerHopkinsYamamoto2014} allow the joint distribution of features used in calculating the difference of marginal means to be arbitrary. This is meant to accommodate the weighting of marginal means to reflect the real-world distribution of feature combinations (e.g., down-weighting African American Republican political candidates given their rarity in real-world politics). Their definition of an AMCE allows for arbitrary weighting, but in practice this is uncommon.

Second, in constrained designs where some cells are unobservable, care needs to be taken in both defining and estimating AMCEs. Take, for example, the trivial example just above. The difference $E[Y|B=2]-E[Y|B=1]$ marginalizes over the full set of levels of $A$ but $E[Y|B=3]-E[Y|B=1]$ marginalizes only over the case where $A=1$. Thus these two marginal effects reflect different subsets of the data and different combinations of values of $A$.

\citeauthor{HainmuellerHopkinsYamamoto2014} allow for these two differences to be presented as the AMCE despite the fact that the quantities marginalize over distinct subsets of the design. Indeed, their definition of AMCE for constrained designs diverges from the intuitive marginal effect to instead define the AMCEs for levels of $B$ as an average of marginal effects of $B$ over subsets of $A$ and the AMCEs for levels of $A$ as averages of the marginal effects of $A$ over subsets of $B$ (again, weighting these marginal effects arbitrarily). For example, suppose feature $A$ is race \{Caucasian, African American\} and feature $B$ is religion \{Evangelical, Catholic, Jewish\}. In \citeauthor{HainmuellerHopkinsYamamoto2014}'s notation, the AMCE of a candidate being Jewish relative to being Evangelical Christian is defined only for Caucasian candidates, while the AMCE of being Catholic is defined for both African American and Caucasian candidates. They present these subset marginal effects as the sample AMCEs even though they are not defined for the whole sample. There is nothing inherently problematic about that but, as noted earlier, it requires either being clear about what features are being marginalized over for each AMCE or an analysis of only the complete and comparable subset of the design (i.e., partitioning the design to form two complete, overlapping experimental designs). So, the researcher in this example may prefer to not present the AMCE of being Jewish together with the other results as it does not draw upon the complete set of feature combinations used in other portions of the analysis.


\clearpage

\section{Impact of Reference Category Choice on AMCEs}\label{app:referencecategory}

Though seemingly arbitrary, the choice of reference category for estimating AMCEs can be quite consequential. For example, in \citeauthor{HainmuellerHopkinsYamamoto2014}'s candidate experiment (again, see Figure~\ref{fig:hainmueller_candidate_replication}), the least liked education level (``no formal education'') is chosen as a reference category, but the authors could have presented the results using any of the categories as the baseline.

The figure below shows how the estimated AMCEs for each level of the education feature would have differed depending on that choice. Selecting a reference category that receives middling support (i.e., more favorability than some other feature levels but less favorability than others) makes some AMCEs positive and others negative, but all AMCEs can be made positive (or negative) simply by choosing a different baseline. The results would be numerically equivalent --- the alternative linear models used to estimate the AMCEs have a mathematical equivalence --- but the choice has sizeable consequences for the interpretation of conjoint analyses, as we discuss below.

<<reference_category, dependson=c("data_hainmueller_immigration"), fig.width=8, fig.height=4, fig.cap="", fig.pos="h">>=
# Education AMCEs from amce_by_reference(); the `BY` column of the result is
# what the legend below titles "Reference Category".
tmp <- amce_by_reference(
  hainmueller_immigration,
  ChosenImmigrant ~ Education,
  ~ Education,
  id = ~CaseID
)

# One vertical dodge specification, shared by the points and their intervals.
tmp_dodge <- ggstance::position_dodgev(height = 1)

# Legend layout: title placed on top, entries arranged in two columns.
tmp_legend <- ggplot2::guide_legend(
  title = "Reference Category",
  title.position = "top",
  ncol = 2
)

ggplot(tmp, aes(x = estimate, y = level, group = BY, colour = BY)) +
  geom_vline(xintercept = 0, colour = "gray") +
  geom_point(position = tmp_dodge) +
  geom_errorbarh(
    aes(xmin = lower, xmax = upper),
    size = 0.2,
    height = 0,
    position = tmp_dodge
  ) +
  scale_colour_discrete(guide = tmp_legend) +
  scale_x_continuous(limits = c(-.3, .3), oob = scales::rescale_none) +
  labs(x = "Estimated AMCE", y = "") +
  theme_minimal() +
  ggplot2::theme(
    panel.grid.major = ggplot2::element_blank(),
    panel.grid.minor = ggplot2::element_blank(),
    legend.position = "bottom"
  )

# Remove the temporaries created by this chunk.
rm(tmp, tmp_dodge, tmp_legend)
@


In \textit{constrained} conjoint designs, the choice of reference category is even more important. Consider, for example, the design of \citeauthor{HainmuellerHopkinsYamamoto2014}'s immigration experiment, which constrains the ``Country of Origin'' feature so that levels `India,' `Germany,' `France,' `Mexico,' `Philippines,' and `Poland' cannot co-occur with the `Escape Persecution' level of the ``Reason for Application'' feature. Consequently, the AMCE for the ``Escape Persecution'' level (relative to the ``Reunite with family'' reference category) is only defined for the subset of the design involving countries `China,' `Sudan,' `Somalia,' and `Iraq.' The AMCEs for those four countries (relative to India as a baseline) marginalize across all reasons for application, but the AMCEs for the first six countries marginalize only across the latter two reasons. Thus the interpretation of AMCEs --- and the basic ability to estimate them in constrained designs --- depends entirely upon the selection of a reference category where all feature levels can co-occur. In a design where \textit{all} features are constrained, then AMCEs are undefined for the design as a whole and only estimable for subsets of the design that are \textit{conditionally} unconstrained.



\clearpage

\section{Re-analysis of `Political Experience' Feature from Teele et al. (2018)}\label{app:tkrpolexperience}

<<tkr_replication_experience, dependson=c("data_tkr"), fig.height=6, fig.width=7, fig.show="hold", fig.pos="h", message=FALSE>>=
# Four stacked panels for the "Political Experience" feature in `tkr`:
#   1. full-sample AMCEs      2. full-sample MMs
#   3. AMCEs by PartyID       4. MMs by PartyID
# Objects are named `panel_*` so that nothing masks base::c(), and the cj()
# calls no longer end in a trailing comma (which passed an empty argument).

# Formula with all six profile features, used for panels 2-4.
# Panel 1 keeps its original specification with `feature_experience` only.
tkr_panel_formula <- winner ~ feature_sex + feature_experience + feature_marital + feature_job + feature_children + feature_age

# Panel 1: full-sample AMCEs.
panel_amce_full <- plot(
  subset(
    cj(
      tkr,
      winner ~ feature_experience,
      id = ~ responseid,
      estimate = "amce"
    ),
    feature == "Political Experience"
  ),
  xlab = NULL, feature_headers = FALSE, vline = 0, xlim = c(-0.21, 0.21), size = 2
) +
  ggtitle(NULL, "Full Sample AMCEs") +
  scale_colour_manual("Feature", limits = "Political Experience", values = "black") +
  theme(
    legend.position = "none",
    panel.border = element_blank(),
    axis.line.x = element_blank(),
    axis.line.y = element_blank()
  )

# Panel 2: full-sample marginal means, with the reference line at 0.5.
panel_mm_full <- plot(
  subset(
    cj(
      tkr,
      tkr_panel_formula,
      id = ~ responseid,
      estimate = "mm"
    ),
    feature == "Political Experience"
  ),
  xlab = NULL, feature_headers = FALSE, vline = 0.5, xlim = c(0.29, 0.71), size = 2
) +
  ggtitle(NULL, "Full Sample MMs") +
  scale_colour_manual("Feature", limits = "Political Experience", values = "black") +
  theme(
    legend.position = "none",
    panel.border = element_blank(),
    axis.line.x = element_blank(),
    axis.line.y = element_blank()
  )

# Panel 3: AMCEs estimated by PartyID; the two groups share one colour and are
# distinguished by point shape plus the text annotations.
panel_amce_by_party <- plot(
  subset(
    cj(
      tkr,
      tkr_panel_formula,
      id = ~ responseid,
      estimate = "amce",
      by = ~ PartyID
    ),
    feature == "Political Experience"
  ),
  xlab = NULL, feature_headers = FALSE, vline = 0, group = "PartyID", xlim = c(-0.21, 0.21), size = 2
) +
  annotate("text", y = 1.65, x = -0.04, label = "Republicans", size = 3) +
  annotate("text", y = 1.3, x = 0.05, label = "Democrats", size = 3) +
  ggtitle(NULL, "Subgroup AMCEs") +
  scale_colour_manual(values = rep("black", 2)) +
  aes(shape = PartyID) +
  scale_shape_manual("PartyID", limits = c("Democrat", "Republican"), values = c(15, 17)) +
  theme(
    legend.position = c(0.1, 0.8),
    legend.title = element_blank(),
    panel.border = element_blank(),
    axis.line.x = element_blank(),
    axis.line.y = element_blank()
  )

# Panel 4: marginal means estimated by PartyID, styled like panel 3.
panel_mm_by_party <- plot(
  subset(
    cj(
      tkr,
      tkr_panel_formula,
      id = ~ responseid,
      estimate = "mm",
      by = ~ PartyID
    ),
    feature == "Political Experience"
  ),
  xlab = NULL, feature_headers = FALSE, vline = 0.5, group = "PartyID", xlim = c(0.29, 0.71), size = 2
) +
  ggtitle(NULL, "Subgroup MMs") +
  scale_colour_manual(values = rep("black", 2)) +
  aes(shape = PartyID) +
  scale_shape_manual("PartyID", limits = c("Democrat", "Republican"), values = c(15, 17)) +
  theme(
    legend.position = c(0.1, 0.8),
    legend.title = element_blank(),
    panel.border = element_blank(),
    axis.line.x = element_blank(),
    axis.line.y = element_blank()
  )

# Convert each panel to a grob and stack them in a single column.
gridExtra::grid.arrange(
  ggplotGrob(panel_amce_full),
  ggplotGrob(panel_mm_full),
  ggplotGrob(panel_amce_by_party),
  ggplotGrob(panel_mm_by_party),
  ncol = 1
)
@

\noindent Conditional AMCEs in this experiment (see 3rd panel, above) correctly convey that both Democrats and Republicans are more likely to favor experienced than inexperienced candidates. Reading the AMCEs descriptively, however, would suggest that Democratic voters are more favorable toward candidates with all levels of experience compared to Republican voters (i.e., Republicans and Democrats differ in their preferences over experienced candidates). Yet the conditional marginal means (4th panel, above) reveal that Democrats and Republicans have very similar preferences toward candidates with 1 or 3 years of experience, but differ dramatically in their preferences over candidates with no experience (the reference category) and those with 8 years experience. Democrats are much more sensitive to experience than are Republicans and important differences in preferences between the groups are apparent for very high and very low experience, but the conditional AMCEs suggest that preferences differ at all levels of experience, when in reality they do not.

\clearpage 

\section{Hainmueller et al. (2014) Candidate Experiment}\label{app:hainmuellercandidate}

\subsection{Replication using AMCEs and MMs}

<<hainmueller_candidate_amce_appendix, dependson=c("data_hainmueller_candidate"), fig.height=8, fig.width=8>>=
# Side-by-side AMCE (left) and MM (right) plots for the candidate experiment,
# each point annotated with "estimate (std.error)".
candidate_amce <- cregg::cj(
  hainmueller_candidate,
  selected ~ atmilitary + atreligion + ated + atprof + atinc + atrace + atage + atmale,
  id = ~ resID,
  estimate = "amce"
)
candidate_mm <- cregg::cj(
  hainmueller_candidate,
  selected ~ atmilitary + atreligion + ated + atprof + atinc + atrace + atage + atmale,
  id = ~ resID,
  estimate = "mm"
)

# AMCE panel: the label is left blank wherever std.error is NA.
p1 <- plot(candidate_amce, vline = 0) +
  ggplot2::scale_x_continuous(
    breaks = c(-0.4, -0.3, -0.2, -0.1, 0.0, 0.1, 0.2, 0.3, 0.4),
    limits = c(-0.4, 0.4)
  ) +
  ggplot2::geom_text(
    aes(label = ifelse(is.na(std.error), "", sprintf("%0.2f (%0.2f)", estimate, std.error))),
    position = position_nudge(y = .5),
    size = 2,
    colour = "black"
  ) +
  theme(legend.position = "none")

# MM panel: reference line at 0.5 and every row labelled.
p2 <- plot(candidate_mm, vline = 0.5) +
  ggplot2::scale_x_continuous(
    breaks = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9),
    limits = c(0.1, 0.9)
  ) +
  ggplot2::geom_text(
    aes(label = sprintf("%0.2f (%0.2f)", estimate, std.error)),
    position = position_nudge(y = .5),
    size = 2,
    colour = "black"
  ) +
  theme(legend.position = "none")

gridExtra::grid.arrange(p1, p2, ncol = 2)
@

\clearpage

\subsection{Numerical Results: AMCEs}

<<hainmueller_candidate_amce_table, dependson=c("data_hainmueller_candidate"), results="asis">>=
# LaTeX table of the candidate-experiment AMCEs, restricted to five columns.
candidate_amce_cols <- cj(
  hainmueller_candidate,
  selected ~ atmilitary + atreligion + ated + atprof + atinc + atrace + atage + atmale,
  id = ~ resID,
  estimate = "amce"
)[c("feature", "level", "estimate", "std.error", "z")]

# `align` has one entry more than the number of columns: the first entry is for
# the row names, which are suppressed when printing.
candidate_amce_xtable <- xtable::xtable(
  candidate_amce_cols,
  digits = 2,
  align = c("l", "l", "p{3in}", "r", "r", "r")
)
print(candidate_amce_xtable, include.rownames = FALSE, size = "footnotesize")
@

\clearpage

\subsection{Numerical Results: MMs}

<<hainmueller_candidate_mm_table, dependson=c("data_hainmueller_candidate"), results="asis">>=
# LaTeX table of the candidate-experiment marginal means (estimated with
# h0 = 0.5), restricted to five columns.
candidate_mm_cols <- cj(
  hainmueller_candidate,
  selected ~ atmilitary + atreligion + ated + atprof + atinc + atrace + atage + atmale,
  id = ~ resID,
  estimate = "mm",
  h0 = 0.5
)[c("feature", "level", "estimate", "std.error", "z")]

# `align` has one entry more than the number of columns: the first entry is for
# the row names, which are suppressed when printing.
candidate_mm_xtable <- xtable::xtable(
  candidate_mm_cols,
  digits = 2,
  align = c("l", "l", "p{3in}", "r", "r", "r")
)
print(candidate_mm_xtable, include.rownames = FALSE, size = "footnotesize")
@

\clearpage

\section{Hainmueller et al. (2014) Immigration Experiment}\label{app:hainmuellerimmigration}

\subsection{Replication using AMCEs}

<<hainmueller_immigration_amce_appendix, dependson=c("data_hainmueller_immigration"), fig.height=8, fig.width=8>>=
# Plot for the immigration experiment from cj() called without an explicit
# `estimate` argument; the formula carries the Education:Job and
# CountryOfOrigin:ReasonForApplication interaction terms.
immigration_estimates <- cregg::cj(
  hainmueller_immigration,
  ChosenImmigrant ~ Gender + Education + LanguageSkills + CountryOfOrigin + Job + JobExperience + JobPlans + ReasonForApplication + PriorEntry + Education:Job + CountryOfOrigin:ReasonForApplication,
  id = ~ CaseID
)

# Each point is labelled "estimate (std.error)"; the label is blank wherever
# std.error is NA.
p1 <- plot(immigration_estimates, vline = 0) +
  ggplot2::scale_x_continuous(
    breaks = c(-0.4, -0.3, -0.2, -0.1, 0.0, 0.1, 0.2, 0.3, 0.4),
    limits = c(-0.4, 0.4)
  ) +
  ggplot2::geom_text(
    aes(label = ifelse(is.na(std.error), "", sprintf("%0.2f (%0.2f)", estimate, std.error))),
    position = position_nudge(y = .5),
    size = 2,
    colour = "black"
  ) +
  theme(legend.position = "none")
p1
@

\clearpage

\subsection{Numerical Results: AMCEs}

<<hainmueller_amce_table, dependson=c("data_hainmueller_immigration"), results="asis">>=
# LaTeX table of the immigration-experiment AMCEs, restricted to five columns
# and printed at "scriptsize".
immigration_amce_cols <- cj(
  hainmueller_immigration,
  ChosenImmigrant ~ Gender + Education + LanguageSkills + CountryOfOrigin + Job + JobExperience + JobPlans + ReasonForApplication + PriorEntry + Education:Job + CountryOfOrigin:ReasonForApplication,
  id = ~ CaseID,
  estimate = "amce"
)[c("feature", "level", "estimate", "std.error", "z")]

# `align` has one entry more than the number of columns: the first entry is for
# the row names, which are suppressed when printing.
immigration_amce_xtable <- xtable::xtable(
  immigration_amce_cols,
  digits = 2,
  align = c("l", "l", "p{3in}", "r", "r", "r")
)
print(immigration_amce_xtable, include.rownames = FALSE, size = "scriptsize")
@


\clearpage

\subsection{Replication using MMs}


<<hainmueller_immigration_mm_appendix, dependson=c("data_hainmueller_immigration"), fig.height=8, fig.width=8>>=
# Marginal-means plot for the immigration experiment. The formula here lists
# the nine features only (no interaction terms).
immigration_mm <- cregg::cj(
  hainmueller_immigration,
  ChosenImmigrant ~ Gender + Education + LanguageSkills + CountryOfOrigin + Job + JobExperience + JobPlans + ReasonForApplication + PriorEntry,
  id = ~ CaseID,
  estimate = "mm"
)

# Reference line at 0.5; each point is labelled "estimate (std.error)".
p2 <- plot(immigration_mm, vline = 0.5) +
  ggplot2::scale_x_continuous(
    breaks = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9),
    limits = c(0.1, 0.9)
  ) +
  ggplot2::geom_text(
    aes(label = sprintf("%0.2f (%0.2f)", estimate, std.error)),
    position = position_nudge(y = .5),
    size = 2,
    colour = "black"
  ) +
  theme(legend.position = "none")
p2
@

\clearpage

\subsection{Numerical Results: MMs}

<<hainmueller_mm_table, dependson=c("data_hainmueller_immigration"), results="asis">>=
# LaTeX table of the immigration-experiment marginal means (estimated with
# h0 = 0.5), restricted to five columns and printed at "scriptsize".
immigration_mm_cols <- cj(
  hainmueller_immigration,
  ChosenImmigrant ~ Gender + Education + LanguageSkills + CountryOfOrigin + Job + JobExperience + JobPlans + ReasonForApplication + PriorEntry + Education:Job + CountryOfOrigin:ReasonForApplication,
  id = ~ CaseID,
  estimate = "mm",
  h0 = 0.5
)[c("feature", "level", "estimate", "std.error", "z")]

# `align` has one entry more than the number of columns: the first entry is for
# the row names, which are suppressed when printing.
immigration_mm_xtable <- xtable::xtable(
  immigration_mm_cols,
  digits = 2,
  align = c("l", "l", "p{3in}", "r", "r", "r")
)
print(immigration_mm_xtable, include.rownames = FALSE, size = "scriptsize")
@

\clearpage

\subsection{Subgroup Analysis using AMCEs}

<<hainmueller_immigration_subgroup_amce_appendix, dependson=c("data_hainmueller_immigration"), fig.height=8, fig.width=8>>=
# Conditional AMCEs by `ethnocentrism_split` for the immigration experiment.
#
# Only rows with a missing `ethnocentrism_split` are excluded. This replaces
# na.omit(), which removes a row when *any* column is NA, so that the sample is
# defined the same way as in the subgroup MM plot and the nested model
# comparison for this experiment.
immigration_split_sample <- subset(hainmueller_immigration, !is.na(ethnocentrism_split))

plot(
  cj(
    immigration_split_sample,
    ChosenImmigrant ~ Gender + Education + LanguageSkills + CountryOfOrigin + Job + JobExperience + JobPlans + ReasonForApplication + PriorEntry + Education:Job + CountryOfOrigin:ReasonForApplication,
    id = ~ CaseID,
    estimate = "amce",  # stated explicitly, as in the other subgroup AMCE chunks
    by = ~ ethnocentrism_split
  ),
  group = "ethnocentrism_split"
)
@

\clearpage

\subsection{Subgroup Analysis using MMs}

<<hainmueller_immigration_subgroup_mm_appendix, dependson=c("data_hainmueller_immigration"), fig.height=8, fig.width=8>>=
# Conditional marginal means by `ethnocentrism_split`, using only rows where
# the split variable is observed.
immigration_split_rows <- subset(hainmueller_immigration, !is.na(ethnocentrism_split))
immigration_mm_by_split <- cj(
  immigration_split_rows,
  ChosenImmigrant ~ Gender + Education + LanguageSkills + CountryOfOrigin + Job + JobExperience + JobPlans + ReasonForApplication + PriorEntry + Education:Job + CountryOfOrigin:ReasonForApplication,
  id = ~ CaseID,
  by = ~ ethnocentrism_split,
  estimate = "mm",
  h0 = 0.5
)
# Reference line at 0.5; estimates grouped by the split variable.
plot(immigration_mm_by_split, vline = 0.5, group = "ethnocentrism_split")
@

\clearpage

\subsection{Nested Model Comparison}

<<hainmueller_immigration_anova, dependson=c("data_hainmueller_immigration")>>=
# Nested model comparison across `ethnocentrism_split`, using only rows where
# the split variable is observed. The result is auto-printed.
immigration_anova_rows <- subset(hainmueller_immigration, !is.na(ethnocentrism_split))
cj_anova(
  immigration_anova_rows,
  ChosenImmigrant ~ Gender + Education + LanguageSkills + CountryOfOrigin + Job + JobExperience + JobPlans + ReasonForApplication + PriorEntry + Education:Job + CountryOfOrigin:ReasonForApplication,
  id = ~ CaseID,
  by = ~ ethnocentrism_split
)
@

\clearpage


\section{Teele et al. (2018) Candidate Experiment}\label{app:tkr}

\subsection{Replication using AMCEs and MMs}

<<tkr_amce_appendix, dependson=c("data_tkr"), fig.height=5, fig.width=8>>=
# Side-by-side AMCE (left) and MM (right) plots for the `tkr` data, each point
# annotated with "estimate (std.error)".
tkr_amce <- cregg::cj(
  tkr,
  winner ~ feature_sex + feature_experience + feature_marital + feature_job + feature_children + feature_age,
  id = ~ responseid,
  estimate = "amce"
)
tkr_mm <- cregg::cj(
  tkr,
  winner ~ feature_sex + feature_experience + feature_marital + feature_job + feature_children + feature_age,
  id = ~ responseid,
  estimate = "mm",
  h0 = 0.5
)

# AMCE panel: the label is left blank wherever std.error is NA.
p1 <- plot(tkr_amce, vline = 0) +
  ggplot2::scale_x_continuous(
    breaks = c(-0.3, -0.2, -0.1, 0.0, 0.1, 0.2, 0.3),
    limits = c(-0.3, 0.3)
  ) +
  ggplot2::geom_text(
    aes(label = ifelse(is.na(std.error), "", sprintf("%0.2f (%0.2f)", estimate, std.error))),
    position = position_nudge(y = .5),
    size = 2,
    colour = "black"
  ) +
  theme(legend.position = "none")

# MM panel: reference line at 0.5 and every row labelled.
p2 <- plot(tkr_mm, vline = 0.5) +
  ggplot2::scale_x_continuous(
    breaks = c(0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8),
    limits = c(0.2, 0.8)
  ) +
  ggplot2::geom_text(
    aes(label = sprintf("%0.2f (%0.2f)", estimate, std.error)),
    position = position_nudge(y = .5),
    size = 2,
    colour = "black"
  ) +
  theme(legend.position = "none")

gridExtra::grid.arrange(p1, p2, ncol = 2)
@

\clearpage

\subsection{Numerical Results: AMCEs}

<<tkr_amce_table, dependson=c("data_tkr"), results="asis">>=
# LaTeX table of the `tkr` AMCEs, restricted to five columns.
tkr_amce_cols <- cj(
  tkr,
  winner ~ feature_sex + feature_experience + feature_marital + feature_job + feature_children + feature_age,
  id = ~ responseid,
  estimate = "amce"
)[c("feature", "level", "estimate", "std.error", "z")]

# `align` has one entry more than the number of columns: the first entry is for
# the row names, which are suppressed when printing.
tkr_amce_xtable <- xtable::xtable(
  tkr_amce_cols,
  digits = 2,
  align = c("l", "l", "p{1.5in}", "r", "r", "r")
)
print(tkr_amce_xtable, include.rownames = FALSE)
@

\clearpage

\subsection{Numerical Results: MMs}

<<tkr_mm_table, dependson=c("data_tkr"), results="asis">>=
# LaTeX table of the `tkr` marginal means (estimated with h0 = 0.5),
# restricted to five columns.
tkr_mm_cols <- cj(
  tkr,
  winner ~ feature_sex + feature_experience + feature_marital + feature_job + feature_children + feature_age,
  id = ~ responseid,
  estimate = "mm",
  h0 = 0.5
)[c("feature", "level", "estimate", "std.error", "z")]

# `align` has one entry more than the number of columns: the first entry is for
# the row names, which are suppressed when printing.
tkr_mm_xtable <- xtable::xtable(
  tkr_mm_cols,
  digits = 2,
  align = c("l", "l", "p{1.5in}", "r", "r", "r")
)
print(tkr_mm_xtable, include.rownames = FALSE)
@

\clearpage

\subsection{Subgroup Analysis using AMCEs}

<<tkr_subgroup_amce_appendix, dependson=c("data_tkr"), fig.height=5, fig.width=8>>=
# Conditional AMCEs by PartyID, using only rows where PartyID is observed.
tkr_party_rows_amce <- subset(tkr, !is.na(PartyID))
tkr_amce_by_party <- cj(
  tkr_party_rows_amce,
  winner ~ feature_sex + feature_experience + feature_marital + feature_job + feature_children + feature_age,
  id = ~ responseid,
  by = ~ PartyID,
  estimate = "amce"
)
plot(tkr_amce_by_party, group = "PartyID")
@

\subsection{Subgroup Analysis using MMs}

<<tkr_subgroup_mm_appendix, dependson=c("data_tkr"), fig.height=5, fig.width=8>>=
# Conditional marginal means by PartyID (estimated with h0 = 0.5), using only
# rows where PartyID is observed.
tkr_party_rows_mm <- subset(tkr, !is.na(PartyID))
tkr_mm_by_party <- cj(
  tkr_party_rows_mm,
  winner ~ feature_sex + feature_experience + feature_marital + feature_job + feature_children + feature_age,
  id = ~ responseid,
  by = ~ PartyID,
  estimate = "mm",
  h0 = 0.5
)
# Reference line at 0.5; estimates grouped by PartyID.
plot(tkr_mm_by_party, vline = 0.5, group = "PartyID")
@

\clearpage

\subsection{Nested Model Comparison: Male/Female Voters}


<<tkr_subgroup_anova_party_sex, dependson=c("data_tkr")>>=
# Nested model comparison across `Sex`, using only rows where `Sex` is
# observed. The result is auto-printed.
cj_anova(
  subset(tkr, !is.na(Sex)),
  winner ~ feature_sex + feature_experience + feature_marital + feature_job + feature_children + feature_age,
  by = ~ Sex,
  id = ~ responseid
)
@

\subsection{Nested Model Comparison: Democratic/Republican Voters}

<<tkr_subgroup_anova_party, dependson=c("data_tkr")>>=
# Nested model comparison across `PartyID`, using only rows where `PartyID` is
# observed. The result is auto-printed.
cj_anova(
  subset(tkr, !is.na(PartyID)),
  winner ~ feature_sex + feature_experience + feature_marital + feature_job + feature_children + feature_age,
  by = ~ PartyID,
  id = ~ responseid
)
@


\clearpage


\subsection{Comparison of Alternative Reference Categories}

<<tkr_subgroup_example, dependson=c("data_tkr")>>=
# Build two copies of `tkr` that differ only in the reference level chosen for
# each feature factor, estimate AMCEs by PartyID on both, and stack the results
# with a `dataset` tag ("A" or "B").

# Dataset A: estimates benchmarked to largest difference between subgroups
refs_A <- c(
  feature_sex = "Male",
  feature_experience = "None",
  feature_marital = "Doctor Spouse",
  feature_job = "Corporate Lawyer",
  feature_children = "1 child",
  feature_age = "65"
)
# Dataset B: estimates benchmarked to smallest difference between subgroups
refs_B <- c(
  feature_sex = "Male",
  feature_experience = "1 year",
  feature_marital = "Farmer Spouse",
  feature_job = "Mayor",
  feature_children = "No children",
  feature_age = "45"
)

# Relevel every feature in turn so that the chosen level becomes the reference.
tkr.A <- tkr
for (feature_name in names(refs_A)) {
  tkr.A[[feature_name]] <- relevel(tkr.A[[feature_name]], refs_A[[feature_name]])
}
tkr.B <- tkr
for (feature_name in names(refs_B)) {
  tkr.B[[feature_name]] <- relevel(tkr.B[[feature_name]], refs_B[[feature_name]])
}

# Common model formula.
f1 <- winner ~ feature_sex + feature_experience + feature_marital + feature_job + feature_children + feature_age

# AMCEs by PartyID on each releveled copy (rows with missing PartyID removed).
amce_by_pid_1 <- cj(
  subset(tkr.A, !is.na(PartyID)),
  f1,
  id = ~ responseid,
  estimate = "amce",
  by = ~ PartyID
)
amce_by_pid_2 <- cj(
  subset(tkr.B, !is.na(PartyID)),
  f1,
  id = ~ responseid,
  estimate = "amce",
  by = ~ PartyID
)

# Tag each result with its dataset and stack them.
amce_by_pid_1$dataset <- "A"
amce_by_pid_2$dataset <- "B"
amce_ref_tkr_merged <- rbind(amce_by_pid_1, amce_by_pid_2)

# Reset the ordering of `level` to the level order of the un-releveled `tkr`,
# feature by feature.
amce_ref_tkr_merged$level <- factor(
  amce_ref_tkr_merged$level,
  levels = unlist(
    lapply(names(refs_A), function(feature_name) levels(tkr[[feature_name]])),
    use.names = FALSE
  )
)
@


<<tkr_subgroup_example_plot, dependson=c("data_tkr", "tkr_subgroup_example"), fig.width=10, fig.height=10, fig.cap="Alternative Reference Categories Features in Teele et al. (2018) Candidate Experiment">>=
# Plot the stacked estimates in `amce_ref_tkr_merged`, grouped by PartyID, in a
# grid with one row per feature and one column per dataset.

# Facet labeller: label_value() with multi_line = FALSE.
single_line_labeller <- function(x) label_value(x, multi_line = FALSE)

# y scales and panel heights are free per feature row; the row strip text is
# blanked and the strip background is white.
plot(
  amce_ref_tkr_merged,
  vline = 0,
  group = "PartyID",
  legend_title = "Party Identification",
  feature_headers = FALSE
) +
  facet_grid(
    rows = vars(feature),
    cols = vars(dataset),
    labeller = single_line_labeller,
    scales = "free_y",
    space = "free_y"
  ) +
  theme(
    strip.text.x = element_text(size = 8),
    strip.text.y = element_blank(),
    strip.background = element_rect(fill = "white", colour = "white")
  )
@


\clearpage

\section{Bechtel and Scheve (2013) Climate Agreement Experiment}\label{app:bechtel}

\subsection{Replication using AMCEs and MMs}

<<bechtel_amce, dependson=c("data_bechtel"), fig.height=8, fig.width=7>>=
# Stacked AMCE (top) and MM (bottom) plots for the `bechtel` data, estimated
# with `weight` as the weights variable; each point is annotated with
# "estimate (std.error)".
bechtel_amce <- cregg::cj(
  bechtel,
  choice_cj ~ cost_cj + distrib_cj + ctries_cj + emissions_cj + sanctions_cj + monitoring_cj,
  id = ~ ID,
  weights = ~ weight,
  estimate = "amce"
)
bechtel_mm <- cregg::cj(
  bechtel,
  choice_cj ~ cost_cj + distrib_cj + ctries_cj + emissions_cj + sanctions_cj + monitoring_cj,
  id = ~ ID,
  weights = ~ weight,
  estimate = "mm",
  h0 = 0.5
)

# AMCE panel: the label is left blank wherever std.error is NA.
p1 <- plot(bechtel_amce, vline = 0) +
  ggplot2::scale_x_continuous(
    breaks = c(-0.4, -0.3, -0.2, -0.1, 0.0, 0.1, 0.2, 0.3, 0.4),
    limits = c(-0.4, 0.4)
  ) +
  ggplot2::geom_text(
    aes(label = ifelse(is.na(std.error), "", sprintf("%0.2f (%0.2f)", estimate, std.error))),
    position = position_nudge(y = .5),
    size = 2,
    colour = "black"
  ) +
  theme(legend.position = "none")

# MM panel: reference line at 0.5 and every row labelled.
p2 <- plot(bechtel_mm, vline = 0.5) +
  ggplot2::scale_x_continuous(
    breaks = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9),
    limits = c(0.1, 0.9)
  ) +
  ggplot2::geom_text(
    aes(label = sprintf("%0.2f (%0.2f)", estimate, std.error)),
    position = position_nudge(y = .5),
    size = 2,
    colour = "black"
  ) +
  theme(legend.position = "none")

gridExtra::grid.arrange(p1, p2, ncol = 1, nrow = 2)
@

%\clearpage

%\subsection{Numerical Results: AMCEs}

<<bechtel_amce_table, dependson=c("data_bechtel"), results="asis", eval=FALSE>>=
# LaTeX table of the weighted `bechtel` AMCEs, restricted to five columns.
# (The enclosing chunk is declared with eval=FALSE.)
bechtel_amce_cols <- cj(
  bechtel,
  choice_cj ~ cost_cj + distrib_cj + ctries_cj + emissions_cj + sanctions_cj + monitoring_cj,
  id = ~ ID,
  weights = ~ weight,
  estimate = "amce"
)[c("feature", "level", "estimate", "std.error", "z")]

# `align` has one entry more than the number of columns: the first entry is for
# the row names, which are suppressed when printing.
bechtel_amce_xtable <- xtable::xtable(
  bechtel_amce_cols,
  digits = 2,
  align = c("l", "l", "p{3in}", "r", "r", "r")
)
print(bechtel_amce_xtable, include.rownames = FALSE, size = "footnotesize")
@

%\clearpage

%\subsection{Numerical Results: MMs}

<<bechtel_mm_table, dependson=c("data_bechtel"), results="asis", eval=FALSE>>=
# LaTeX table of the weighted `bechtel` marginal means (estimated with
# h0 = 0.5), restricted to five columns.
# (The enclosing chunk is declared with eval=FALSE.)
bechtel_mm_cols <- cj(
  bechtel,
  choice_cj ~ cost_cj + distrib_cj + ctries_cj + emissions_cj + sanctions_cj + monitoring_cj,
  id = ~ ID,
  weights = ~ weight,
  estimate = "mm",
  h0 = 0.5
)[c("feature", "level", "estimate", "std.error", "z")]

# `align` has one entry more than the number of columns: the first entry is for
# the row names, which are suppressed when printing.
bechtel_mm_xtable <- xtable::xtable(
  bechtel_mm_cols,
  digits = 2,
  align = c("l", "l", "p{3in}", "r", "r", "r")
)
print(bechtel_mm_xtable, include.rownames = FALSE, size = "footnotesize")
@


\clearpage

\subsection{Subgroup Analysis using AMCEs: Country}

<<bechtel_subgroup_amce, dependson=c("data_bechtel")>>=
# Weighted AMCEs estimated by `country`, plotted with estimates grouped on the
# `BY` column and a reference line at 0.
bechtel_amce_by_country <- cj(
  bechtel,
  choice_cj ~ cost_cj + distrib_cj + ctries_cj + emissions_cj + sanctions_cj + monitoring_cj,
  id = ~ ID,
  weights = ~ weight,
  estimate = "amce",
  by = ~ country
)
plot(bechtel_amce_by_country, vline = 0, group = "BY")
@

%\clearpage

<<bechtel_subgroup_amce_table, dependson=c("data_bechtel"), results="asis", eval=FALSE>>=
# Weighted AMCEs estimated by `country`; the result is returned at top level.
# (The enclosing chunk is declared with eval=FALSE.)
cj(
  bechtel,
  choice_cj ~ cost_cj + distrib_cj + ctries_cj + emissions_cj + sanctions_cj + monitoring_cj,
  id = ~ ID,
  weights = ~ weight,
  estimate = "amce",
  by = ~ country
)
@

\clearpage

\subsection{Subgroup Analysis using MMs: Country}

<<bechtel_subgroup_mm, dependson=c("data_bechtel")>>=
# Weighted marginal means estimated by `country`, plotted with estimates
# grouped on the `BY` column and a reference line at 0.5.
bechtel_mm_by_country <- cj(
  bechtel,
  choice_cj ~ cost_cj + distrib_cj + ctries_cj + emissions_cj + sanctions_cj + monitoring_cj,
  id = ~ ID,
  weights = ~ weight,
  estimate = "mm",
  by = ~ country
)
plot(bechtel_mm_by_country, vline = 0.5, group = "BY")
@

%\clearpage

<<bechtel_subgroup_mm_table, dependson=c("data_bechtel"), results="asis", eval=FALSE>>=
# Weighted marginal means estimated by `country`; the result is returned at
# top level. (The enclosing chunk is declared with eval=FALSE.)
cj(
  bechtel,
  choice_cj ~ cost_cj + distrib_cj + ctries_cj + emissions_cj + sanctions_cj + monitoring_cj,
  id = ~ ID,
  weights = ~ weight,
  estimate = "mm",
  by = ~ country
)
@

\clearpage

\subsection{Subgroup Analysis using AMCEs: Environmentalism}

<<bechtel_subgroup_amce_environmentalism, dependson=c("data_bechtel")>>=
# Weighted AMCEs estimated by `environmentalism`, plotted with estimates
# grouped on the `BY` column and a reference line at 0.
bechtel_amce_by_environmentalism <- cj(
  bechtel,
  choice_cj ~ cost_cj + distrib_cj + ctries_cj + emissions_cj + sanctions_cj + monitoring_cj,
  id = ~ ID,
  weights = ~ weight,
  by = ~ environmentalism,
  estimate = "amce"
)
plot(bechtel_amce_by_environmentalism, vline = 0, group = "BY")
@

\clearpage

\subsection{Subgroup Analysis using MMs: Environmentalism}

<<bechtel_subgroup_mm_environmentalism, dependson=c("data_bechtel")>>=
# Weighted marginal means estimated by `environmentalism`, plotted with
# estimates grouped on the `BY` column and a reference line at 0.5.
bechtel_mm_by_environmentalism <- cj(
  bechtel,
  choice_cj ~ cost_cj + distrib_cj + ctries_cj + emissions_cj + sanctions_cj + monitoring_cj,
  id = ~ ID,
  weights = ~ weight,
  by = ~ environmentalism,
  estimate = "mm"
)
plot(bechtel_mm_by_environmentalism, vline = 0.5, group = "BY")
@

\clearpage

\subsection{Subgroup Analysis using AMCEs: Reciprocity}

<<bechtel_subgroup_amce_reciprocity, dependson=c("data_bechtel")>>=
# Weighted AMCEs estimated by `reciprocity`, plotted with estimates grouped on
# the `BY` column and a reference line at 0.
bechtel_amce_by_reciprocity <- cj(
  bechtel,
  choice_cj ~ cost_cj + distrib_cj + ctries_cj + emissions_cj + sanctions_cj + monitoring_cj,
  id = ~ ID,
  weights = ~ weight,
  by = ~ reciprocity,
  estimate = "amce"
)
plot(bechtel_amce_by_reciprocity, vline = 0, group = "BY")
@

\clearpage

\subsection{Subgroup Analysis using MMs: Reciprocity}

<<bechtel_subgroup_mm_reciprocity, dependson=c("data_bechtel")>>=
# Weighted marginal means estimated by `reciprocity`, plotted with estimates
# grouped on the `BY` column and a reference line at 0.5.
bechtel_mm_by_reciprocity <- cj(
  bechtel,
  choice_cj ~ cost_cj + distrib_cj + ctries_cj + emissions_cj + sanctions_cj + monitoring_cj,
  id = ~ ID,
  weights = ~ weight,
  by = ~ reciprocity,
  estimate = "mm"
)
plot(bechtel_mm_by_reciprocity, vline = 0.5, group = "BY")
@

\clearpage

\subsection{Nested Model Comparison: Country}

<<bechtel_country_anova>>=
# Nested model comparison across `country`, using only rows where `country` is
# observed. No `weights` argument is passed in this call.
cj_anova(
  subset(bechtel, !is.na(country)),
  choice_cj ~ cost_cj + distrib_cj + ctries_cj + emissions_cj + sanctions_cj + monitoring_cj,
  by = ~ country,
  id = ~ ID
)
@

\subsection{Nested Model Comparison: Environmentalism}

<<bechtel_environmentalism_anova>>=
# Nested model comparison across `environmentalism`, using only rows where it
# is observed. No `weights` argument is passed in this call.
cj_anova(
  subset(bechtel, !is.na(environmentalism)),
  choice_cj ~ cost_cj + distrib_cj + ctries_cj + emissions_cj + sanctions_cj + monitoring_cj,
  by = ~ environmentalism,
  id = ~ ID
)
@

\clearpage

\subsection{Nested Model Comparison: Reciprocity}

<<bechtel_reciprocity_anova>>=
# Nested model comparison across `reciprocity`, using only rows where it is
# observed. No `weights` argument is passed in this call.
cj_anova(
  subset(bechtel, !is.na(reciprocity)),
  choice_cj ~ cost_cj + distrib_cj + ctries_cj + emissions_cj + sanctions_cj + monitoring_cj,
  by = ~ reciprocity,
  id = ~ ID
)
@

\clearpage


\subsection{Comparison of Alternative Reference Categories}


<<bechtel_reciprocity_reference, fig.height=5, fig.width=7>>=
# Quoted claim (13768): ``Individuals who pretreatment exhibit reciprocal
# behavior [...] are almost twice as sensitive to both the number of countries
# participating and the proportion of emissions represented than individuals
# who do not''.
#
# This sensitivity is accurately stated, but the use of conditional AMCEs
# implies that the subgroup difference is due to differences in favorability
# toward agreements involving 20 countries and differing opinions over
# agreements with 80 or 160 countries. In reality the differences lie in
# 20-country and 160-country agreements, and the groups do not differ in their
# favorability toward 80-country agreements at all.
#
# Weighted marginal means by `reciprocity` for the two features concerned.
bechtel_reciprocity_mm <- cj(
  bechtel,
  choice_cj ~ ctries_cj + emissions_cj,
  id = ~ ID,
  weights = ~ weight,
  by = ~ reciprocity,
  estimate = "mm"
)
plot(bechtel_reciprocity_mm, vline = 0.5, group = "BY")
@

\clearpage

\section*{}

\noindent This paper was built using \texttt{knitr::knit2pdf()} under the following environment:

<<session_info, cache=FALSE>>=
# Report the R session details for the environment this document was built in.
utils::sessionInfo()
@

\end{document}
